{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "eval_steps": 500, "global_step": 9520, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 484.3125, "completions/min_length": 384.0, "epoch": 0.0014705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.3758432865142822, "kl": 0.0, "learning_rate": 7.352941176470588e-10, "loss": -3.725290298461914e-08, "reward": 0.26081252098083496, "reward_std": 0.17200373113155365, "rewards/DrugCombAccuracyCOTORM/mean": 0.07796874642372131, "rewards/DrugCombAccuracyCOTORM/std": 0.25350743532180786, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 421.8125, "completions/min_length": 379.0, "epoch": 0.0029411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4705882352941176e-09, "loss": 0.0, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 444.375, "completions/min_length": 401.0, "epoch": 0.004411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021577109582722187, "kl": 0.0004868519527576609, "learning_rate": 2.2058823529411765e-09, "loss": 4.862614332523663e-06, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 467.1875, "completions/min_length": 399.0, "epoch": 0.0058823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8923552632331848, "kl": 0.0008477500814478844, "learning_rate": 2.941176470588235e-09, "loss": 8.553266525268555e-06, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 445.0, "completions/min_length": 330.0, "epoch": 0.007352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9200132489204407, "kl": 0.0016757839766796678, "learning_rate": 3.676470588235294e-09, "loss": 1.689046621322632e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 453.0625, "completions/min_length": 361.0, "epoch": 0.008823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8805301189422607, "kl": 0.0019120124343317002, "learning_rate": 4.411764705882353e-09, "loss": 1.9033137505175546e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 445.3125, "completions/min_length": 393.0, "epoch": 0.010294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1012688875198364, "kl": 0.002634360105730593, "learning_rate": 5.147058823529412e-09, "loss": 2.602487802505493e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 438.5625, "completions/min_length": 346.0, "epoch": 0.011764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.4102001190185547, "kl": 0.0022548306442331523, "learning_rate": 5.88235294117647e-09, "loss": 2.246006806672085e-05, "reward": 0.6687500476837158, "reward_std": 0.1944543719291687, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 455.6875, "completions/min_length": 383.0, "epoch": 0.013235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0364049673080444, "kl": 0.0019404163176659495, "learning_rate": 6.617647058823529e-09, "loss": 1.9419938325881958e-05, "reward": 0.7822916507720947, "reward_std": 0.2232154756784439, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39849257469177246, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.57373046875, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 428.5625, "completions/min_length": 386.0, "epoch": 0.014705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1354128122329712, "kl": 0.001724779634969309, "learning_rate": 7.352941176470588e-09, "loss": 1.7255544662475586e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 470.4375, "completions/min_length": 428.0, "epoch": 0.016176470588235296, "frac_reward_zero_std": 0.5, "grad_norm": 1.2795844078063965, "kl": 0.0025562557275407016, "learning_rate": 8.088235294117647e-09, "loss": 2.5558116249158047e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 415.6875, "completions/min_length": 373.0, "epoch": 0.01764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.005842229817062616, "kl": 0.0019046732631977648, "learning_rate": 8.823529411764706e-09, "loss": 1.8899954739026725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 435.875, "completions/min_length": 339.0, "epoch": 0.01911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015781866386532784, "kl": 0.0021865514863748103, "learning_rate": 9.558823529411765e-09, "loss": 2.195024353568442e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 438.625, "completions/min_length": 384.0, "epoch": 0.020588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.004857431631535292, "kl": 0.0021517945860978216, "learning_rate": 1.0294117647058823e-08, "loss": 2.1612760974676348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 444.9375, "completions/min_length": 389.0, "epoch": 0.022058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9789406061172485, "kl": 0.0021402715938165784, "learning_rate": 1.1029411764705882e-08, "loss": 2.1144747734069824e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 547.375, "completions/min_length": 434.0, "epoch": 0.023529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.235033392906189, "kl": 0.002466189063852653, "learning_rate": 1.176470588235294e-08, "loss": 2.4452805519104004e-05, "reward": 0.4034999907016754, "reward_std": 0.29356446862220764, "rewards/DrugCombAccuracyCOTORM/mean": 0.27000001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.3762977719306946, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 486.75, "completions/min_length": 436.0, "epoch": 0.025, "frac_reward_zero_std": 0.5, "grad_norm": 1.1329352855682373, "kl": 0.00190069354721345, "learning_rate": 1.25e-08, "loss": 1.9005430658580735e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 422.4375, "completions/min_length": 356.0, "epoch": 0.026470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038534458726644516, "kl": 0.001613244297914207, "learning_rate": 1.3235294117647058e-08, "loss": 1.5934703696984798e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 441.0625, "completions/min_length": 395.0, "epoch": 0.027941176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.1384934186935425, "kl": 0.00208524230401963, "learning_rate": 1.3970588235294118e-08, "loss": 2.0950232283212245e-05, "reward": 0.6273333430290222, "reward_std": 0.04703797399997711, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 464.0625, "completions/min_length": 430.0, "epoch": 0.029411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007741258945316076, "kl": 0.0018857641553040594, "learning_rate": 1.4705882352941176e-08, "loss": 1.8881695723393932e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 422.75, "completions/min_length": 393.0, "epoch": 0.030882352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.004455781541764736, "kl": 0.00196303817210719, "learning_rate": 1.5441176470588234e-08, "loss": 1.9603108739829622e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 503.1875, "completions/min_length": 418.0, "epoch": 0.03235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9765053391456604, "kl": 0.002160727104637772, "learning_rate": 1.6176470588235295e-08, "loss": 2.1457672119140625e-05, "reward": 0.8916666507720947, "reward_std": 0.07071065902709961, "rewards/DrugCombAccuracyCOTORM/mean": 0.8645833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.18477964401245117, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 439.1875, "completions/min_length": 389.0, "epoch": 0.033823529411764704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006136398762464523, "kl": 0.0020675828855019063, "learning_rate": 1.691176470588235e-08, "loss": 2.0712388504762203e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 449.5, "completions/min_length": 398.0, "epoch": 0.03529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.959482729434967, "kl": 0.002337916404940188, "learning_rate": 1.7647058823529412e-08, "loss": 2.3255601263372228e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 485.125, "completions/min_length": 404.0, "epoch": 0.03676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9810288548469543, "kl": 0.0019970060093328357, "learning_rate": 1.838235294117647e-08, "loss": 2.0050931198056787e-05, "reward": 0.8108452558517456, "reward_std": 0.07709717005491257, "rewards/DrugCombAccuracyCOTORM/mean": 0.7791815400123596, "rewards/DrugCombAccuracyCOTORM/std": 0.2642911970615387, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.14719602465629578, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 478.25, "completions/min_length": 354.0, "epoch": 0.03823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.067804217338562, "kl": 0.0025883626367431134, "learning_rate": 1.911764705882353e-08, "loss": 2.589067844382953e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 481.875, "completions/min_length": 440.0, "epoch": 0.039705882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 0.9427921772003174, "kl": 0.0020878546638414264, "learning_rate": 1.9852941176470586e-08, "loss": 2.1010637283325195e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 428.5, "completions/min_length": 346.0, "epoch": 0.041176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.005529171787202358, "kl": 0.0022486053640022874, "learning_rate": 2.0588235294117647e-08, "loss": 2.2374066247721203e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 474.375, "completions/min_length": 402.0, "epoch": 0.04264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0428656339645386, "kl": 0.0027250605926383287, "learning_rate": 2.1323529411764707e-08, "loss": 2.7232141292188317e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 462.875, "completions/min_length": 425.0, "epoch": 0.04411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.028430664911866188, "kl": 0.0029885361436754465, "learning_rate": 2.2058823529411764e-08, "loss": 2.993427187902853e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 468.875, "completions/min_length": 395.0, "epoch": 0.045588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.9306969046592712, "kl": 0.0025035533763002604, "learning_rate": 2.279411764705882e-08, "loss": 2.53153957601171e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 438.6875, "completions/min_length": 405.0, "epoch": 0.047058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.005466648377478123, "kl": 0.0021205029042903334, "learning_rate": 2.352941176470588e-08, "loss": 2.097571450576652e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 473.8125, "completions/min_length": 425.0, "epoch": 0.04852941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3738651275634766, "kl": 0.00221079372568056, "learning_rate": 2.4264705882352942e-08, "loss": 2.2135674953460693e-05, "reward": 0.7749999761581421, "reward_std": 0.41661906242370605, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 442.125, "completions/min_length": 371.0, "epoch": 0.05, "frac_reward_zero_std": 0.5, "grad_norm": 0.9419844746589661, "kl": 0.0023167825129348785, "learning_rate": 2.5e-08, "loss": 2.2964068193687126e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 471.5, "completions/min_length": 413.0, "epoch": 0.051470588235294115, "frac_reward_zero_std": 0.5, "grad_norm": 0.8453255891799927, "kl": 0.002396786294411868, "learning_rate": 2.5735294117647056e-08, "loss": 2.4149439923348837e-05, "reward": 0.5734999775886536, "reward_std": 0.05458850413560867, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 487.5, "completions/min_length": 382.0, "epoch": 0.052941176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010125357657670975, "kl": 0.0022346916957758367, "learning_rate": 2.6470588235294116e-08, "loss": 2.2491680283565074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 586.75, "completions/min_length": 533.0, "epoch": 0.054411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.1944931745529175, "kl": 0.0017880495579447597, "learning_rate": 2.7205882352941177e-08, "loss": 1.7773360013961792e-05, "reward": 0.5973958373069763, "reward_std": 0.30366483330726624, "rewards/DrugCombAccuracyCOTORM/mean": 0.5104166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4103690981864929, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 443.3125, "completions/min_length": 397.0, "epoch": 0.05588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006058094557374716, "kl": 0.002424095757305622, "learning_rate": 2.7941176470588237e-08, "loss": 2.400306948402431e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 430.4375, "completions/min_length": 389.0, "epoch": 0.057352941176470586, "frac_reward_zero_std": 0.5, "grad_norm": 0.9848595857620239, "kl": 0.0020527675515040755, "learning_rate": 2.867647058823529e-08, "loss": 2.0489096641540527e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 513.0, "completions/min_length": 458.0, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.215982437133789, "kl": 0.0018917692068498582, "learning_rate": 2.941176470588235e-08, "loss": 1.905113458633423e-05, "reward": 0.35350000858306885, "reward_std": 0.26379823684692383, "rewards/DrugCombAccuracyCOTORM/mean": 0.20750001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.326751708984375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 409.5, "completions/min_length": 372.0, "epoch": 0.060294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8462201356887817, "kl": 0.002598672639578581, "learning_rate": 3.014705882352941e-08, "loss": 2.578902422101237e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 472.5, "completions/min_length": 449.0, "epoch": 0.061764705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.8955628275871277, "kl": 0.001833127811551094, "learning_rate": 3.088235294117647e-08, "loss": 1.8194317817687988e-05, "reward": 0.8698333501815796, "reward_std": 0.028292693197727203, "rewards/DrugCombAccuracyCOTORM/mean": 0.8477083444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.16454075276851654, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 487.25, "completions/min_length": 402.0, "epoch": 0.06323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6850042343139648, "kl": 0.0019289676274638623, "learning_rate": 3.161764705882353e-08, "loss": 1.903623342514038e-05, "reward": 0.5484374761581421, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 422.375, "completions/min_length": 356.0, "epoch": 0.06470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.04148115590214729, "kl": 0.004456705995835364, "learning_rate": 3.235294117647059e-08, "loss": 4.4041498767910525e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 456.3125, "completions/min_length": 381.0, "epoch": 0.0661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016118060797452927, "kl": 0.002655104355653748, "learning_rate": 3.308823529411764e-08, "loss": 2.6304947823518887e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 452.1875, "completions/min_length": 409.0, "epoch": 0.06764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.163788914680481, "kl": 0.002040578081505373, "learning_rate": 3.38235294117647e-08, "loss": 2.0513580238912255e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 479.5, "completions/min_length": 423.0, "epoch": 0.06911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9986541271209717, "kl": 0.001980062574148178, "learning_rate": 3.4558823529411764e-08, "loss": 1.95428729057312e-05, "reward": 0.8708125352859497, "reward_std": 0.054213378578424454, "rewards/DrugCombAccuracyCOTORM/mean": 0.8463281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.18244221806526184, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 470.875, "completions/min_length": 374.0, "epoch": 0.07058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.005396522115916014, "kl": 0.0017797397449612617, "learning_rate": 3.5294117647058824e-08, "loss": 1.764923035807442e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 413.4375, "completions/min_length": 372.0, "epoch": 0.07205882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.385157823562622, "kl": 0.0031338042463175952, "learning_rate": 3.602941176470588e-08, "loss": 3.078579902648926e-05, "reward": 0.8312499523162842, "reward_std": 0.36911600828170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 489.875, "completions/min_length": 430.0, "epoch": 0.07352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007999125868082047, "kl": 0.002495683584129438, "learning_rate": 3.676470588235294e-08, "loss": 2.4721930458326824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 517.1875, "completions/min_length": 404.0, "epoch": 0.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.8063914179801941, "kl": 0.0018027882324531674, "learning_rate": 3.75e-08, "loss": 1.827627420425415e-05, "reward": 0.9928571581840515, "reward_std": 0.02020304463803768, "rewards/DrugCombAccuracyCOTORM/mean": 0.9910714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.0357142835855484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 481.0625, "completions/min_length": 424.0, "epoch": 0.07647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9292656779289246, "kl": 0.0024062044103629887, "learning_rate": 3.823529411764706e-08, "loss": 2.4233795556938276e-05, "reward": 0.6996666789054871, "reward_std": 0.19859345257282257, "rewards/DrugCombAccuracyCOTORM/mean": 0.6662499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.45040538907051086, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6666666865348816, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 449.5, "completions/min_length": 400.0, "epoch": 0.07794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.008814355358481407, "kl": 0.0026238110149279237, "learning_rate": 3.897058823529412e-08, "loss": 2.619847873575054e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 494.625, "completions/min_length": 425.0, "epoch": 0.07941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9231691956520081, "kl": 0.0017864433757495135, "learning_rate": 3.970588235294117e-08, "loss": 1.7777085304260254e-05, "reward": 0.9156249761581421, "reward_std": 0.09122592955827713, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.1717960685491562, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 418.5625, "completions/min_length": 379.0, "epoch": 0.08088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.216592788696289, "kl": 0.0021157314477022737, "learning_rate": 4.044117647058823e-08, "loss": 2.118450720445253e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 459.75, "completions/min_length": 426.0, "epoch": 0.08235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4206777811050415, "kl": 0.002143910765880719, "learning_rate": 4.1176470588235293e-08, "loss": 2.1383166313171387e-05, "reward": 0.6789582967758179, "reward_std": 0.38419705629348755, "rewards/DrugCombAccuracyCOTORM/mean": 0.6312500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4371403455734253, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3381595313549042, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 514.75, "completions/min_length": 391.0, "epoch": 0.0838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8437605500221252, "kl": 0.001935226988280192, "learning_rate": 4.191176470588235e-08, "loss": 1.9237399101257324e-05, "reward": 0.5824305415153503, "reward_std": 0.012630056589841843, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.824305534362793, "rewards/DrugCombCoverageCOTORM/std": 0.25040605664253235, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 458.5625, "completions/min_length": 395.0, "epoch": 0.08529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1580392122268677, "kl": 0.0037034167326055467, "learning_rate": 4.2647058823529414e-08, "loss": 3.701439709402621e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 430.875, "completions/min_length": 398.0, "epoch": 0.08676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.005685754586011171, "kl": 0.0016310779610648751, "learning_rate": 4.338235294117647e-08, "loss": 1.644336589379236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/mean_length": 574.0625, "completions/min_length": 476.0, "epoch": 0.08823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.4059923887252808, "kl": 0.0022602081880904734, "learning_rate": 4.411764705882353e-08, "loss": 2.2597740098717622e-05, "reward": 0.7683515548706055, "reward_std": 0.11083678901195526, "rewards/DrugCombAccuracyCOTORM/mean": 0.7198144197463989, "rewards/DrugCombAccuracyCOTORM/std": 0.3495086431503296, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.12531442940235138, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 408.4375, "completions/min_length": 365.0, "epoch": 0.08970588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 2.6657233238220215, "kl": 0.004416789597598836, "learning_rate": 4.485294117647059e-08, "loss": 4.6096742153167725e-05, "reward": 0.7000000476837158, "reward_std": 0.3989730179309845, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 404.0, "completions/min_length": 347.0, "epoch": 0.09117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.011429976671934128, "kl": 0.0030704413075000048, "learning_rate": 4.558823529411764e-08, "loss": 3.0509581847582012e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 495.875, "completions/min_length": 435.0, "epoch": 0.09264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1511681079864502, "kl": 0.002337416633963585, "learning_rate": 4.63235294117647e-08, "loss": 2.3195132598630153e-05, "reward": 0.593500018119812, "reward_std": 0.0406729094684124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5309374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.48794543743133545, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 463.875, "completions/min_length": 421.0, "epoch": 0.09411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.004146087449043989, "kl": 0.001994004618609324, "learning_rate": 4.705882352941176e-08, "loss": 1.9871133190463297e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 474.0, "completions/min_length": 413.0, "epoch": 0.09558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.102586269378662, "kl": 0.002068264439003542, "learning_rate": 4.779411764705882e-08, "loss": 2.0563602447509766e-05, "reward": 0.7484375238418579, "reward_std": 0.20835641026496887, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 487.0625, "completions/min_length": 430.0, "epoch": 0.09705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9579600095748901, "kl": 0.0027576752472668886, "learning_rate": 4.8529411764705884e-08, "loss": 2.7366522772354074e-05, "reward": 0.550000011920929, "reward_std": 0.053452249616384506, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 454.75, "completions/min_length": 405.0, "epoch": 0.09852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072653335519135, "kl": 0.002206294419011101, "learning_rate": 4.926470588235294e-08, "loss": 2.214054075011518e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 441.1875, "completions/min_length": 405.0, "epoch": 0.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.013626402243971825, "kl": 0.002545105555327609, "learning_rate": 5e-08, "loss": 2.5395778720849194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 456.1875, "completions/min_length": 372.0, "epoch": 0.10147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006017673294991255, "kl": 0.001885735779069364, "learning_rate": 5.073529411764706e-08, "loss": 1.861130840552505e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 479.5625, "completions/min_length": 410.0, "epoch": 0.10294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8956087827682495, "kl": 0.0021462755685206503, "learning_rate": 5.147058823529411e-08, "loss": 2.1617241145577282e-05, "reward": 0.6085000038146973, "reward_std": 0.05560147017240524, "rewards/DrugCombAccuracyCOTORM/mean": 0.5366666913032532, "rewards/DrugCombAccuracyCOTORM/std": 0.48374465107917786, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 464.875, "completions/min_length": 378.0, "epoch": 0.10441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012600259855389595, "kl": 0.003563664387911558, "learning_rate": 5.220588235294118e-08, "loss": 3.464183464529924e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 481.125, "completions/min_length": 386.0, "epoch": 0.10588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010715271346271038, "kl": 0.003178343002218753, "learning_rate": 5.294117647058823e-08, "loss": 3.1440573366126046e-05, "reward": 0.7016666531562805, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.637499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3743883967399597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 457.5625, "completions/min_length": 389.0, "epoch": 0.10735294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.5001341104507446, "kl": 0.002048340713372454, "learning_rate": 5.3676470588235286e-08, "loss": 2.041459083557129e-05, "reward": 0.887499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 466.3125, "completions/min_length": 402.0, "epoch": 0.10882352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1282153129577637, "kl": 0.0028109869454056025, "learning_rate": 5.441176470588235e-08, "loss": 2.779066562652588e-05, "reward": 0.887499988079071, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 453.6875, "completions/min_length": 399.0, "epoch": 0.11029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 3.1117615699768066, "kl": 0.052468698006123304, "learning_rate": 5.514705882352941e-08, "loss": 0.0005057513481006026, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 444.0625, "completions/min_length": 356.0, "epoch": 0.11176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.009358135052025318, "kl": 0.0022737440303899348, "learning_rate": 5.5882352941176474e-08, "loss": 2.2651256585959345e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 429.125, "completions/min_length": 363.0, "epoch": 0.11323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.3763355016708374, "kl": 0.0032002499792724848, "learning_rate": 5.661764705882353e-08, "loss": 3.1694769859313965e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 452.25, "completions/min_length": 411.0, "epoch": 0.11470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008503729477524757, "kl": 0.002535105944843963, "learning_rate": 5.735294117647058e-08, "loss": 2.5339335479657166e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 460.3125, "completions/min_length": 396.0, "epoch": 0.1161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.005278727971017361, "kl": 0.002022715692874044, "learning_rate": 5.808823529411765e-08, "loss": 2.0148168914602138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 491.375, "completions/min_length": 428.0, "epoch": 0.11764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 4.162345886230469, "kl": 0.05032421601936221, "learning_rate": 5.88235294117647e-08, "loss": 0.00048768011038191617, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 382.9375, "completions/min_length": 349.0, "epoch": 0.11911764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.008234409615397453, "kl": 0.002168791659642011, "learning_rate": 5.9558823529411756e-08, "loss": 2.16927619476337e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 417.1875, "completions/min_length": 351.0, "epoch": 0.12058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9837365746498108, "kl": 0.0019000710744876415, "learning_rate": 6.029411764705882e-08, "loss": 1.901603718579281e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 425.75, "completions/min_length": 360.0, "epoch": 0.12205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.007108614314347506, "kl": 0.002097500691888854, "learning_rate": 6.102941176470588e-08, "loss": 2.084342486341484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 439.5, "completions/min_length": 355.0, "epoch": 0.12352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9413732290267944, "kl": 0.0022331357758957893, "learning_rate": 6.176470588235294e-08, "loss": 2.24700452235993e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 507.9375, "completions/min_length": 442.0, "epoch": 0.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.8654562830924988, "kl": 0.0018107768264599144, "learning_rate": 6.25e-08, "loss": 1.8243681552121416e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 439.5625, "completions/min_length": 364.0, "epoch": 0.1264705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 2.15273380279541, "kl": 0.004111401678528637, "learning_rate": 6.323529411764706e-08, "loss": 4.027038812637329e-05, "reward": 0.5375000238418579, "reward_std": 0.46579426527023315, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 426.1875, "completions/min_length": 387.0, "epoch": 0.12794117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 1.4591206312179565, "kl": 0.002138593408744782, "learning_rate": 6.39705882352941e-08, "loss": 2.1539628505706787e-05, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 473.125, "completions/min_length": 415.0, "epoch": 0.12941176470588237, "frac_reward_zero_std": 0.0, "grad_norm": 1.9896355867385864, "kl": 0.004027960356324911, "learning_rate": 6.470588235294118e-08, "loss": 4.045665264129639e-05, "reward": 0.3500000238418579, "reward_std": 0.3868754506111145, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 461.125, "completions/min_length": 391.0, "epoch": 0.13088235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0433638095855713, "kl": 0.0034172030282206833, "learning_rate": 6.544117647058824e-08, "loss": 3.3718253689585254e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 502.0625, "completions/min_length": 371.0, "epoch": 0.1323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00500436220318079, "kl": 0.0021736165217589587, "learning_rate": 6.617647058823529e-08, "loss": 2.1616981030092575e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 464.25, "completions/min_length": 412.0, "epoch": 0.1338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2230708599090576, "kl": 0.0025217453949153423, "learning_rate": 6.691176470588235e-08, "loss": 2.557538573455531e-05, "reward": 0.909250020980835, "reward_std": 0.1701066941022873, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 431.75, "completions/min_length": 404.0, "epoch": 0.13529411764705881, "frac_reward_zero_std": 1.0, "grad_norm": 0.004817711189389229, "kl": 0.001839890843257308, "learning_rate": 6.76470588235294e-08, "loss": 1.8398715837975033e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 444.6875, "completions/min_length": 397.0, "epoch": 0.13676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0062036411836743355, "kl": 0.002188343816669658, "learning_rate": 6.838235294117648e-08, "loss": 2.1874158846912906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 469.25, "completions/min_length": 406.0, "epoch": 0.13823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.006059317383915186, "kl": 0.0019864405621774495, "learning_rate": 6.911764705882353e-08, "loss": 1.9873717974405736e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 430.5625, "completions/min_length": 393.0, "epoch": 0.13970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.004903667140752077, "kl": 0.001803702296456322, "learning_rate": 6.985294117647059e-08, "loss": 1.803812847356312e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 424.0, "completions/min_length": 338.0, "epoch": 0.1411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.4894706010818481, "kl": 0.0022890381515026093, "learning_rate": 7.058823529411765e-08, "loss": 2.2784213797422126e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 477.1875, "completions/min_length": 399.0, "epoch": 0.1426470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6207239627838135, "kl": 0.002193474763771519, "learning_rate": 7.13235294117647e-08, "loss": 2.2038817405700684e-05, "reward": 0.5125000476837158, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 477.125, "completions/min_length": 364.0, "epoch": 0.14411764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.1394480466842651, "kl": 0.0025391404051333666, "learning_rate": 7.205882352941176e-08, "loss": 2.508610486984253e-05, "reward": 0.8606696128845215, "reward_std": 0.19576384127140045, "rewards/DrugCombAccuracyCOTORM/mean": 0.8385323286056519, "rewards/DrugCombAccuracyCOTORM/std": 0.3499637842178345, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8984375, "rewards/DrugCombCoverageCOTORM/std": 0.2630698084831238, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 461.3125, "completions/min_length": 400.0, "epoch": 0.14558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.2561419010162354, "kl": 0.0026138918183278292, "learning_rate": 7.279411764705883e-08, "loss": 2.5993551389547065e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 422.5, "completions/min_length": 357.0, "epoch": 0.14705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0318608283996582, "kl": 0.002421830518869683, "learning_rate": 7.352941176470588e-08, "loss": 2.419830343569629e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 444.5, "completions/min_length": 415.0, "epoch": 0.14852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.22667445242404938, "kl": 0.007454300706740469, "learning_rate": 7.426470588235294e-08, "loss": 7.3494971729815e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 467.4375, "completions/min_length": 413.0, "epoch": 0.15, "frac_reward_zero_std": 0.5, "grad_norm": 1.200747013092041, "kl": 0.0022757701808586717, "learning_rate": 7.5e-08, "loss": 2.2749174604541622e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 456.375, "completions/min_length": 414.0, "epoch": 0.1514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.993813157081604, "kl": 0.0027949561190325767, "learning_rate": 7.573529411764704e-08, "loss": 2.8012531402055174e-05, "reward": 0.882437527179718, "reward_std": 0.16225165128707886, "rewards/DrugCombAccuracyCOTORM/mean": 0.8589062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.30334246158599854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 495.625, "completions/min_length": 403.0, "epoch": 0.15294117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 1.5153614282608032, "kl": 0.002698108146432787, "learning_rate": 7.647058823529412e-08, "loss": 2.699717879295349e-05, "reward": 0.4125000238418579, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 423.875, "completions/min_length": 316.0, "epoch": 0.15441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.00822522398084402, "kl": 0.002285847906023264, "learning_rate": 7.720588235294118e-08, "loss": 2.2484164219349623e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 387.3125, "completions/min_length": 329.0, "epoch": 0.15588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006700050551444292, "kl": 0.002035778685240075, "learning_rate": 7.794117647058824e-08, "loss": 2.0174775272607803e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 471.25, "completions/min_length": 425.0, "epoch": 0.15735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.011983906850218773, "kl": 0.002965781750390306, "learning_rate": 7.867647058823529e-08, "loss": 2.9775721486657858e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.6875, "completions/min_length": 411.0, "epoch": 0.1588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2385057210922241, "kl": 0.002728232939261943, "learning_rate": 7.941176470588235e-08, "loss": 2.7089889044873416e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 422.9375, "completions/min_length": 354.0, "epoch": 0.16029411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.006620351690798998, "kl": 0.0019296469981782138, "learning_rate": 8.014705882352942e-08, "loss": 1.9172053725924343e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 468.9375, "completions/min_length": 402.0, "epoch": 0.16176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.005365528166294098, "kl": 0.0020025696721859276, "learning_rate": 8.088235294117647e-08, "loss": 2.0017756469314918e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 419.9375, "completions/min_length": 381.0, "epoch": 0.16323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008424651809036732, "kl": 0.00201363381347619, "learning_rate": 8.161764705882353e-08, "loss": 2.01171133085154e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 481.125, "completions/min_length": 444.0, "epoch": 0.16470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.004083192441612482, "kl": 0.0014318293251562864, "learning_rate": 8.235294117647059e-08, "loss": 1.4251350876293145e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 447.625, "completions/min_length": 378.0, "epoch": 0.1661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0617592334747314, "kl": 0.0024500166764482856, "learning_rate": 8.308823529411763e-08, "loss": 2.454221248626709e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 476.3125, "completions/min_length": 406.0, "epoch": 0.1676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009443193674087524, "kl": 0.0029780791082885116, "learning_rate": 8.38235294117647e-08, "loss": 2.9815277230227366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 449.6875, "completions/min_length": 394.0, "epoch": 0.16911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0702173709869385, "kl": 0.002475883287843317, "learning_rate": 8.455882352941177e-08, "loss": 2.486384255462326e-05, "reward": 0.8525000214576721, "reward_std": 0.17300352454185486, "rewards/DrugCombAccuracyCOTORM/mean": 0.8260416984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.3296237289905548, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.18257419764995575, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 483.625, "completions/min_length": 409.0, "epoch": 0.17058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.005928292870521545, "kl": 0.002455109846778214, "learning_rate": 8.529411764705883e-08, "loss": 2.421467797830701e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 454.9375, "completions/min_length": 396.0, "epoch": 0.17205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.058260202407837, "kl": 0.001965186296729371, "learning_rate": 8.602941176470588e-08, "loss": 1.973658800125122e-05, "reward": 0.643750011920929, "reward_std": 0.14500616490840912, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 457.3125, "completions/min_length": 379.0, "epoch": 0.17352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.128021001815796, "kl": 0.0021303846151567996, "learning_rate": 8.676470588235294e-08, "loss": 2.1263957023620605e-05, "reward": 0.6546131372451782, "reward_std": 0.12270642817020416, "rewards/DrugCombAccuracyCOTORM/mean": 0.6086309552192688, "rewards/DrugCombAccuracyCOTORM/std": 0.45417097210884094, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6770833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5593274831771851, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 427.125, "completions/min_length": 333.0, "epoch": 0.175, "frac_reward_zero_std": 1.0, "grad_norm": 0.013243451714515686, "kl": 0.0026753045385703444, "learning_rate": 8.75e-08, "loss": 2.6448449716554023e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 427.125, "completions/min_length": 388.0, "epoch": 0.17647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.005909573752433062, "kl": 0.0024135520216077566, "learning_rate": 8.823529411764706e-08, "loss": 2.418341682641767e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 464.375, "completions/min_length": 380.0, "epoch": 0.17794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2805687189102173, "kl": 0.0020947199664078653, "learning_rate": 8.897058823529412e-08, "loss": 2.111494541168213e-05, "reward": 0.6255833506584167, "reward_std": 0.1592976450920105, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5896483659744263, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 491.75, "completions/min_length": 463.0, "epoch": 0.17941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0484622716903687, "kl": 0.002597773156594485, "learning_rate": 8.970588235294118e-08, "loss": 2.5706496671773493e-05, "reward": 0.8208333253860474, "reward_std": 0.15281745791435242, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 439.0, "completions/min_length": 388.0, "epoch": 0.18088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1338322162628174, "kl": 0.0027525483747012913, "learning_rate": 9.044117647058822e-08, "loss": 2.7579060770221986e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 475.0625, "completions/min_length": 383.0, "epoch": 0.18235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.18008553981781, "kl": 0.001656693872064352, "learning_rate": 9.117647058823528e-08, "loss": 1.664087176322937e-05, "reward": 0.6625000238418579, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 501.625, "completions/min_length": 442.0, "epoch": 0.18382352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 0.832512378692627, "kl": 0.0022467210656031966, "learning_rate": 9.191176470588236e-08, "loss": 2.2349408027366735e-05, "reward": 0.5359722375869751, "reward_std": 0.07996458560228348, "rewards/DrugCombAccuracyCOTORM/mean": 0.5249999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.159722238779068, "rewards/DrugCombCoverageCOTORM/std": 0.9575077295303345, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 445.125, "completions/min_length": 350.0, "epoch": 0.18529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.006902729626744986, "kl": 0.0026708198711276054, "learning_rate": 9.26470588235294e-08, "loss": 2.655999742273707e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 437.0625, "completions/min_length": 376.0, "epoch": 0.18676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009366653859615326, "kl": 0.0020521739206742495, "learning_rate": 9.338235294117647e-08, "loss": 2.0424740796443075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 479.375, "completions/min_length": 359.0, "epoch": 0.18823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.5284122228622437, "kl": 0.0021300732623785734, "learning_rate": 9.411764705882353e-08, "loss": 2.12155282497406e-05, "reward": 0.515500009059906, "reward_std": 0.16081106662750244, "rewards/DrugCombAccuracyCOTORM/mean": 0.4529687464237213, "rewards/DrugCombAccuracyCOTORM/std": 0.5018875002861023, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.546875, "rewards/DrugCombCoverageCOTORM/std": 0.807355523109436, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 488.625, "completions/min_length": 451.0, "epoch": 0.18970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.82599276304245, "kl": 0.0021806363947689533, "learning_rate": 9.485294117647059e-08, "loss": 2.1866653696633875e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 485.5, "completions/min_length": 396.0, "epoch": 0.19117647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.2312129735946655, "kl": 0.002372792805545032, "learning_rate": 9.558823529411763e-08, "loss": 2.3559832698083483e-05, "reward": 0.6713916659355164, "reward_std": 0.08114421367645264, "rewards/DrugCombAccuracyCOTORM/mean": 0.6186666488647461, "rewards/DrugCombAccuracyCOTORM/std": 0.4147202670574188, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7645833492279053, "rewards/DrugCombCoverageCOTORM/std": 0.2919839322566986, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 471.4375, "completions/min_length": 442.0, "epoch": 0.19264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.004637153819203377, "kl": 0.0017461358802393079, "learning_rate": 9.632352941176471e-08, "loss": 1.7522463167551905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 455.4375, "completions/min_length": 409.0, "epoch": 0.19411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8136469721794128, "kl": 0.002196451503550634, "learning_rate": 9.705882352941177e-08, "loss": 2.2098422050476074e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 409.4375, "completions/min_length": 349.0, "epoch": 0.19558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.007720321416854858, "kl": 0.0021264168608468026, "learning_rate": 9.779411764705881e-08, "loss": 2.136878902092576e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 469.125, "completions/min_length": 415.0, "epoch": 0.19705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1870445013046265, "kl": 0.0030197326268535107, "learning_rate": 9.852941176470587e-08, "loss": 3.0294060707092285e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 420.8125, "completions/min_length": 388.0, "epoch": 0.19852941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.005714697297662497, "kl": 0.0019586673006415367, "learning_rate": 9.926470588235294e-08, "loss": 1.9521226931829005e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 486.1875, "completions/min_length": 434.0, "epoch": 0.2, "frac_reward_zero_std": 0.5, "grad_norm": 0.889508843421936, "kl": 0.0021247458935249597, "learning_rate": 1e-07, "loss": 2.1246596588753164e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 435.5, "completions/min_length": 383.0, "epoch": 0.20147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057607246562838554, "kl": 0.0018546147912275046, "learning_rate": 1.0073529411764706e-07, "loss": 1.8624637959874235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 535.4375, "completions/min_length": 451.0, "epoch": 0.20294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9715650081634521, "kl": 0.0022088935074862093, "learning_rate": 1.0147058823529412e-07, "loss": 2.1668920453521423e-05, "reward": 0.9127603769302368, "reward_std": 0.07224123179912567, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.059839196503162384, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/mean_length": 511.625, "completions/min_length": 381.0, "epoch": 0.20441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0777270793914795, "kl": 0.0026591648056637496, "learning_rate": 1.0220588235294116e-07, "loss": 2.663327904883772e-05, "reward": 0.2667819559574127, "reward_std": 0.1277277171611786, "rewards/DrugCombAccuracyCOTORM/mean": 0.16407638788223267, "rewards/DrugCombAccuracyCOTORM/std": 0.25943976640701294, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3708333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.4364630877971649, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.5625, "completions/min_length": 389.0, "epoch": 0.20588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.025130510330200195, "kl": 0.002983561047585681, "learning_rate": 1.0294117647058822e-07, "loss": 2.9611321224365383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 455.625, "completions/min_length": 390.0, "epoch": 0.2073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1380006074905396, "kl": 0.0024168826348613948, "learning_rate": 1.036764705882353e-07, "loss": 2.4162232875823975e-05, "reward": 0.6000000238418579, "reward_std": 0.16256865859031677, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.25, "completions/min_length": 388.0, "epoch": 0.2088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012852140702307224, "kl": 0.0029458347125910223, "learning_rate": 1.0441176470588236e-07, "loss": 2.9505563361453824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 435.1875, "completions/min_length": 417.0, "epoch": 0.21029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.006925384048372507, "kl": 0.0016825308557599783, "learning_rate": 1.051470588235294e-07, "loss": 1.6846166545292363e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 532.0, "completions/min_length": 476.0, "epoch": 0.21176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4323153495788574, "kl": 0.0023253792605828494, "learning_rate": 1.0588235294117647e-07, "loss": 2.3379921913146973e-05, "reward": 0.9082000255584717, "reward_std": 0.22422704100608826, "rewards/DrugCombAccuracyCOTORM/mean": 0.8894166946411133, "rewards/DrugCombAccuracyCOTORM/std": 0.2605910003185272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9666666984558105, "rewards/DrugCombCoverageCOTORM/std": 0.0942808985710144, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 494.125, "completions/min_length": 405.0, "epoch": 0.21323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074532004073262215, "kl": 0.002369412104599178, "learning_rate": 1.0661764705882353e-07, "loss": 2.3746202714391984e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 384.75, "completions/min_length": 337.0, "epoch": 0.21470588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.007559503428637981, "kl": 0.0021622574131470174, "learning_rate": 1.0735294117647057e-07, "loss": 2.1553512851824053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 434.625, "completions/min_length": 351.0, "epoch": 0.2161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.7229686975479126, "kl": 0.002835462713846937, "learning_rate": 1.0808823529411765e-07, "loss": 2.850528653652873e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 462.0625, "completions/min_length": 416.0, "epoch": 0.21764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.008388024754822254, "kl": 0.002678190590813756, "learning_rate": 1.088235294117647e-07, "loss": 2.6641257136361673e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 501.75, "completions/min_length": 423.0, "epoch": 0.21911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8169358372688293, "kl": 0.0018604509241413325, "learning_rate": 1.0955882352941175e-07, "loss": 1.862929275375791e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 545.125, "completions/min_length": 452.0, "epoch": 0.22058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.942216694355011, "kl": 0.0025582292291801423, "learning_rate": 1.1029411764705881e-07, "loss": 2.5646888389019296e-05, "reward": 0.9074000120162964, "reward_std": 0.14612823724746704, "rewards/DrugCombAccuracyCOTORM/mean": 0.887374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.2730875313282013, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 430.25, "completions/min_length": 385.0, "epoch": 0.22205882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.006099768448621035, "kl": 0.0021352571493480355, "learning_rate": 1.1102941176470587e-07, "loss": 2.138155650754925e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 478.125, "completions/min_length": 427.0, "epoch": 0.2235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.474170446395874, "kl": 0.002828891039825976, "learning_rate": 1.1176470588235295e-07, "loss": 2.8252601623535156e-05, "reward": 0.893750011920929, "reward_std": 0.1971900761127472, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 434.8125, "completions/min_length": 394.0, "epoch": 0.225, "frac_reward_zero_std": 0.5, "grad_norm": 1.1894887685775757, "kl": 0.0020614365639630705, "learning_rate": 1.125e-07, "loss": 2.0584722733474337e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 423.625, "completions/min_length": 393.0, "epoch": 0.22647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.5169578790664673, "kl": 0.0027376752987038344, "learning_rate": 1.1323529411764706e-07, "loss": 2.7507543563842773e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 487.0625, "completions/min_length": 426.0, "epoch": 0.22794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.2754772901535034, "kl": 0.0023483873810619116, "learning_rate": 1.1397058823529412e-07, "loss": 2.3461878299713135e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 508.3125, "completions/min_length": 437.0, "epoch": 0.22941176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 1.6715760231018066, "kl": 0.0023643726599402726, "learning_rate": 1.1470588235294116e-07, "loss": 2.3506581783294678e-05, "reward": 0.6818419694900513, "reward_std": 0.26488620042800903, "rewards/DrugCombAccuracyCOTORM/mean": 0.6183179616928101, "rewards/DrugCombAccuracyCOTORM/std": 0.39319998025894165, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.871874988079071, "rewards/DrugCombCoverageCOTORM/std": 0.1505199372768402, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 506.6875, "completions/min_length": 396.0, "epoch": 0.23088235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3242340087890625, "kl": 0.0026536624354775995, "learning_rate": 1.1544117647058824e-07, "loss": 2.633035182952881e-05, "reward": 0.6722181439399719, "reward_std": 0.36813855171203613, "rewards/DrugCombAccuracyCOTORM/mean": 0.6262101531028748, "rewards/DrugCombAccuracyCOTORM/std": 0.4157586991786957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7124999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.6761410236358643, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 415.4375, "completions/min_length": 337.0, "epoch": 0.2323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1858410835266113, "kl": 0.0024050458159763366, "learning_rate": 1.161764705882353e-07, "loss": 2.408838554401882e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 392.1875, "completions/min_length": 349.0, "epoch": 0.2338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00769247068092227, "kl": 0.0018433591758366674, "learning_rate": 1.1691176470588234e-07, "loss": 1.8459290004102513e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 442.875, "completions/min_length": 343.0, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7309169769287109, "kl": 0.0015925001061987132, "learning_rate": 1.176470588235294e-07, "loss": 1.5885474567767233e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 516.5, "completions/min_length": 469.0, "epoch": 0.23676470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.3547303676605225, "kl": 0.0024454812810290605, "learning_rate": 1.1838235294117646e-07, "loss": 2.390146255493164e-05, "reward": 0.3166666626930237, "reward_std": 0.23878967761993408, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4714045524597168, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 449.25, "completions/min_length": 357.0, "epoch": 0.23823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.1212550401687622, "kl": 0.0027560495072975755, "learning_rate": 1.1911764705882351e-07, "loss": 2.8155744075775146e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 455.5625, "completions/min_length": 418.0, "epoch": 0.23970588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 1.0325175523757935, "kl": 0.002376550139160827, "learning_rate": 1.1985294117647059e-07, "loss": 2.3919110390124843e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 447.1875, "completions/min_length": 377.0, "epoch": 0.2411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2099727392196655, "kl": 0.0033382404362782836, "learning_rate": 1.2058823529411763e-07, "loss": 3.3589996746741235e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 483.625, "completions/min_length": 411.0, "epoch": 0.2426470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4179333448410034, "kl": 0.0027060346328653395, "learning_rate": 1.213235294117647e-07, "loss": 2.703070640563965e-05, "reward": 0.7875000238418579, "reward_std": 0.33917659521102905, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 457.25, "completions/min_length": 407.0, "epoch": 0.24411764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.8992053270339966, "kl": 0.0026499013765715063, "learning_rate": 1.2205882352941175e-07, "loss": 2.650916576385498e-05, "reward": 0.7124999761581421, "reward_std": 0.44393861293792725, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 430.9375, "completions/min_length": 377.0, "epoch": 0.24558823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 1.4849687814712524, "kl": 0.002469846949679777, "learning_rate": 1.227941176470588e-07, "loss": 2.460181713104248e-05, "reward": 0.762499988079071, "reward_std": 0.4001959264278412, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 554.9375, "completions/min_length": 427.0, "epoch": 0.24705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 1.5776163339614868, "kl": 0.002072832954581827, "learning_rate": 1.2352941176470587e-07, "loss": 2.0965933799743652e-05, "reward": 0.5356152057647705, "reward_std": 0.31618523597717285, "rewards/DrugCombAccuracyCOTORM/mean": 0.4695189595222473, "rewards/DrugCombAccuracyCOTORM/std": 0.38975954055786133, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6000000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.4560701847076416, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 470.9375, "completions/min_length": 409.0, "epoch": 0.2485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.223634958267212, "kl": 0.0021298665669746697, "learning_rate": 1.2426470588235295e-07, "loss": 2.1435320377349854e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 433.0, "completions/min_length": 401.0, "epoch": 0.25, "frac_reward_zero_std": 0.5, "grad_norm": 1.1726359128952026, "kl": 0.002652132825460285, "learning_rate": 1.25e-07, "loss": 2.643915831868071e-05, "reward": 0.4000000059604645, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 477.25, "completions/min_length": 401.0, "epoch": 0.2514705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.7632062435150146, "kl": 0.0030918490956537426, "learning_rate": 1.2573529411764704e-07, "loss": 3.0681490898132324e-05, "reward": 0.7301042079925537, "reward_std": 0.37601137161254883, "rewards/DrugCombAccuracyCOTORM/mean": 0.675000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4725815951824188, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 493.0, "completions/min_length": 368.0, "epoch": 0.2529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9795502424240112, "kl": 0.0025329752825200558, "learning_rate": 1.2647058823529412e-07, "loss": 2.5160610675811768e-05, "reward": 0.5577254295349121, "reward_std": 0.07294617593288422, "rewards/DrugCombAccuracyCOTORM/mean": 0.5322262048721313, "rewards/DrugCombAccuracyCOTORM/std": 0.4895923137664795, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.319444477558136, "rewards/DrugCombCoverageCOTORM/std": 0.8441275954246521, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 411.5625, "completions/min_length": 364.0, "epoch": 0.25441176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.0216988325119019, "kl": 0.0020163909066468477, "learning_rate": 1.2720588235294116e-07, "loss": 1.9944911400671117e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 476.75, "completions/min_length": 446.0, "epoch": 0.25588235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 1.451029896736145, "kl": 0.0022778218844905496, "learning_rate": 1.279411764705882e-07, "loss": 2.2716820240020752e-05, "reward": 0.40625, "reward_std": 0.39297354221343994, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 410.375, "completions/min_length": 383.0, "epoch": 0.25735294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.007557980716228485, "kl": 0.0023899721854832023, "learning_rate": 1.2867647058823528e-07, "loss": 2.389642031630501e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 453.1875, "completions/min_length": 354.0, "epoch": 0.25882352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 1.006169319152832, "kl": 0.0021319001098163426, "learning_rate": 1.2941176470588236e-07, "loss": 2.1286308765411377e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 466.3125, "completions/min_length": 385.0, "epoch": 0.26029411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 0.9399146437644958, "kl": 0.0016998067148961127, "learning_rate": 1.301470588235294e-07, "loss": 1.701402106846217e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 469.6875, "completions/min_length": 388.0, "epoch": 0.26176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 2.3321123123168945, "kl": 0.0026143354480154812, "learning_rate": 1.3088235294117648e-07, "loss": 2.619498991407454e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 487.3125, "completions/min_length": 394.0, "epoch": 0.26323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9016780257225037, "kl": 0.0023805417004041374, "learning_rate": 1.3161764705882352e-07, "loss": 2.372264862060547e-05, "reward": 0.674750030040741, "reward_std": 0.12709462642669678, "rewards/DrugCombAccuracyCOTORM/mean": 0.622083306312561, "rewards/DrugCombAccuracyCOTORM/std": 0.44011256098747253, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5087202787399292, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 486.0, "completions/min_length": 363.0, "epoch": 0.2647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0967390537261963, "kl": 0.0021189999824855477, "learning_rate": 1.3235294117647057e-07, "loss": 2.11372971534729e-05, "reward": 0.20000000298023224, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 445.8125, "completions/min_length": 394.0, "epoch": 0.2661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.4444400072097778, "kl": 0.002129115251591429, "learning_rate": 1.3308823529411764e-07, "loss": 2.1204352378845215e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.9375, "completions/min_length": 343.0, "epoch": 0.2676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9272468090057373, "kl": 0.0031707746675238013, "learning_rate": 1.338235294117647e-07, "loss": 3.1357652915176004e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 411.3125, "completions/min_length": 343.0, "epoch": 0.2691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8358972072601318, "kl": 0.002366003522183746, "learning_rate": 1.3455882352941177e-07, "loss": 2.3548587705590762e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 455.625, "completions/min_length": 351.0, "epoch": 0.27058823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 0.9824440479278564, "kl": 0.0019395627896301448, "learning_rate": 1.352941176470588e-07, "loss": 1.9252300262451172e-05, "reward": 0.8964166641235352, "reward_std": 0.19718943536281586, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 469.0625, "completions/min_length": 424.0, "epoch": 0.27205882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.005419933702796698, "kl": 0.002114631119184196, "learning_rate": 1.3602941176470586e-07, "loss": 2.11607912206091e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 439.375, "completions/min_length": 372.0, "epoch": 0.2735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.2447636127471924, "kl": 0.024750976299401373, "learning_rate": 1.3676470588235296e-07, "loss": 0.0002500563859939575, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 479.5625, "completions/min_length": 388.0, "epoch": 0.275, "frac_reward_zero_std": 0.5, "grad_norm": 0.8598042130470276, "kl": 0.002260232256958261, "learning_rate": 1.375e-07, "loss": 2.2314488887786865e-05, "reward": 0.675000011920929, "reward_std": 0.14880476891994476, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4905354380607605, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 445.1875, "completions/min_length": 387.0, "epoch": 0.27647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.007210095878690481, "kl": 0.0025139839272014797, "learning_rate": 1.3823529411764705e-07, "loss": 2.495174339856021e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 477.625, "completions/min_length": 442.0, "epoch": 0.27794117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 1.3882232904434204, "kl": 0.002178506925702095, "learning_rate": 1.3897058823529413e-07, "loss": 2.1599233150482178e-05, "reward": 0.9045624732971191, "reward_std": 0.26993799209594727, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.25361964106559753, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 459.0625, "completions/min_length": 383.0, "epoch": 0.27941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.8384394645690918, "kl": 0.002814283303450793, "learning_rate": 1.3970588235294117e-07, "loss": 2.8133392333984375e-05, "reward": 0.49262499809265137, "reward_std": 0.4624272584915161, "rewards/DrugCombAccuracyCOTORM/mean": 0.4478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5045558214187622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34375, "rewards/DrugCombCoverageCOTORM/std": 0.9437293410301208, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 461.625, "completions/min_length": 392.0, "epoch": 0.28088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9578444957733154, "kl": 0.0018605530494824052, "learning_rate": 1.4044117647058822e-07, "loss": 1.849824002420064e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 437.125, "completions/min_length": 398.0, "epoch": 0.2823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.877996027469635, "kl": 0.002822358423145488, "learning_rate": 1.411764705882353e-07, "loss": 2.8709968319162726e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 437.4375, "completions/min_length": 385.0, "epoch": 0.2838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0969330072402954, "kl": 0.0020142071007285267, "learning_rate": 1.4191176470588234e-07, "loss": 2.0063114789081737e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 475.5, "completions/min_length": 412.0, "epoch": 0.2852941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1866179704666138, "kl": 0.0021612031559925526, "learning_rate": 1.426470588235294e-07, "loss": 2.168350874853786e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 478.625, "completions/min_length": 418.0, "epoch": 0.2867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1704024076461792, "kl": 0.0028261594707146287, "learning_rate": 1.4338235294117646e-07, "loss": 2.8738406399497762e-05, "reward": 0.768750011920929, "reward_std": 0.24775780737400055, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 456.875, "completions/min_length": 397.0, "epoch": 0.28823529411764703, "frac_reward_zero_std": 1.0, "grad_norm": 0.01003714744001627, "kl": 0.002441651187837124, "learning_rate": 1.441176470588235e-07, "loss": 2.4602224584668875e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 459.5625, "completions/min_length": 405.0, "epoch": 0.2897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9650393128395081, "kl": 0.0017862807726487517, "learning_rate": 1.4485294117647058e-07, "loss": 1.7823593225330114e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 449.0, "completions/min_length": 430.0, "epoch": 0.2911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0498228073120117, "kl": 0.0019757254340220243, "learning_rate": 1.4558823529411766e-07, "loss": 1.9720624550245702e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 469.9375, "completions/min_length": 395.0, "epoch": 0.2926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.004381035920232534, "kl": 0.002017226826865226, "learning_rate": 1.463235294117647e-07, "loss": 2.013892117247451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 461.3125, "completions/min_length": 359.0, "epoch": 0.29411764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011146166361868382, "kl": 0.0025310784694738686, "learning_rate": 1.4705882352941175e-07, "loss": 2.5768447812879458e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 465.5, "completions/min_length": 394.0, "epoch": 0.29558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7696278095245361, "kl": 0.0023061379324644804, "learning_rate": 1.4779411764705883e-07, "loss": 2.310425043106079e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 419.5, "completions/min_length": 374.0, "epoch": 0.29705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.007205713540315628, "kl": 0.0021452361543197185, "learning_rate": 1.4852941176470587e-07, "loss": 2.1397036107373424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 468.3125, "completions/min_length": 415.0, "epoch": 0.2985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008834168314933777, "kl": 0.002434247377095744, "learning_rate": 1.4926470588235292e-07, "loss": 2.4266348191304132e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 505.1875, "completions/min_length": 398.0, "epoch": 0.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.7321112751960754, "kl": 0.001671172067290172, "learning_rate": 1.5e-07, "loss": 1.6587790014455095e-05, "reward": 0.8064732551574707, "reward_std": 0.18921338021755219, "rewards/DrugCombAccuracyCOTORM/mean": 0.7678571343421936, "rewards/DrugCombAccuracyCOTORM/std": 0.3864191174507141, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 432.25, "completions/min_length": 337.0, "epoch": 0.3014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9518985152244568, "kl": 0.0032054518233053386, "learning_rate": 1.5073529411764704e-07, "loss": 3.203004598617554e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 529.125, "completions/min_length": 428.0, "epoch": 0.3029411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.2058011293411255, "kl": 0.002290529228048399, "learning_rate": 1.514705882352941e-07, "loss": 2.263486385345459e-05, "reward": 0.14141666889190674, "reward_std": 0.16028350591659546, "rewards/DrugCombAccuracyCOTORM/mean": 0.043958332389593124, "rewards/DrugCombAccuracyCOTORM/std": 0.1758333444595337, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9287087917327881, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 485.75, "completions/min_length": 434.0, "epoch": 0.3044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.044905424118042, "kl": 0.0025770909269340336, "learning_rate": 1.5220588235294116e-07, "loss": 2.5692435883684084e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 485.875, "completions/min_length": 422.0, "epoch": 0.3058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.5318102836608887, "kl": 0.005208441813010722, "learning_rate": 1.5294117647058823e-07, "loss": 5.2016228437423706e-05, "reward": 0.5919166803359985, "reward_std": 0.22220344841480255, "rewards/DrugCombAccuracyCOTORM/mean": 0.5237500071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4464707374572754, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.29891782999038696, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 427.1875, "completions/min_length": 351.0, "epoch": 0.3073529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.6607589721679688, "kl": 0.002341394341783598, "learning_rate": 1.536764705882353e-07, "loss": 2.3290514945983887e-05, "reward": 0.7269999980926514, "reward_std": 0.37084728479385376, "rewards/DrugCombAccuracyCOTORM/mean": 0.6978124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4644816815853119, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.44604745507240295, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 433.9375, "completions/min_length": 370.0, "epoch": 0.3088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006805986166000366, "kl": 0.001821369311073795, "learning_rate": 1.5441176470588236e-07, "loss": 1.821948535507545e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 468.5625, "completions/min_length": 428.0, "epoch": 0.31029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 5.653364658355713, "kl": 0.05997600877890363, "learning_rate": 1.551470588235294e-07, "loss": 0.0006275017512962222, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 458.5, "completions/min_length": 379.0, "epoch": 0.31176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051030865870416164, "kl": 0.0018517950957175344, "learning_rate": 1.5588235294117648e-07, "loss": 1.857195093180053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 488.125, "completions/min_length": 388.0, "epoch": 0.31323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.213776707649231, "kl": 0.0016865635698195547, "learning_rate": 1.5661764705882352e-07, "loss": 1.683831214904785e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 467.5625, "completions/min_length": 423.0, "epoch": 0.31470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011287989094853401, "kl": 0.0028446237556636333, "learning_rate": 1.5735294117647057e-07, "loss": 2.8428263249224983e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 419.375, "completions/min_length": 365.0, "epoch": 0.3161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00536441570147872, "kl": 0.0018733467732090503, "learning_rate": 1.5808823529411764e-07, "loss": 1.868201252364088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 478.8125, "completions/min_length": 427.0, "epoch": 0.3176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8667519092559814, "kl": 0.0020860721997451037, "learning_rate": 1.588235294117647e-07, "loss": 2.0973384380340576e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 478.1875, "completions/min_length": 401.0, "epoch": 0.3191176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1486530303955078, "kl": 0.0022351035731844604, "learning_rate": 1.5955882352941174e-07, "loss": 2.2213906049728394e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 487.0, "completions/min_length": 440.0, "epoch": 0.3205882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4450234174728394, "kl": 0.00243066597613506, "learning_rate": 1.6029411764705884e-07, "loss": 2.4616718292236328e-05, "reward": 0.6730000376701355, "reward_std": 0.3725161552429199, "rewards/DrugCombAccuracyCOTORM/mean": 0.6016666889190674, "rewards/DrugCombAccuracyCOTORM/std": 0.4441521167755127, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 484.375, "completions/min_length": 360.0, "epoch": 0.3220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9571201801300049, "kl": 0.0023121163831092417, "learning_rate": 1.6102941176470589e-07, "loss": 2.295977719768416e-05, "reward": 0.6426388025283813, "reward_std": 0.04617663472890854, "rewards/DrugCombAccuracyCOTORM/mean": 0.571267306804657, "rewards/DrugCombAccuracyCOTORM/std": 0.45026230812072754, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.856249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.19311049580574036, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 449.625, "completions/min_length": 347.0, "epoch": 0.3235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9117065668106079, "kl": 0.0022344561584759504, "learning_rate": 1.6176470588235293e-07, "loss": 2.2273063223110512e-05, "reward": 0.8958333134651184, "reward_std": 0.19287919998168945, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 436.5, "completions/min_length": 396.0, "epoch": 0.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.006299276370555162, "kl": 0.0024059242277871817, "learning_rate": 1.625e-07, "loss": 2.423603473289404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 445.4375, "completions/min_length": 382.0, "epoch": 0.3264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01356850191950798, "kl": 0.0025279815308749676, "learning_rate": 1.6323529411764705e-07, "loss": 2.4562950784456916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 416.3125, "completions/min_length": 346.0, "epoch": 0.32794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.007140073459595442, "kl": 0.002296642429428175, "learning_rate": 1.639705882352941e-07, "loss": 2.3124877770897e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 487.0625, "completions/min_length": 386.0, "epoch": 0.32941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9829716086387634, "kl": 0.0026070193271152675, "learning_rate": 1.6470588235294117e-07, "loss": 2.629234040796291e-05, "reward": 0.375, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 482.3125, "completions/min_length": 372.0, "epoch": 0.33088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.004995420109480619, "kl": 0.00199366980814375, "learning_rate": 1.6544117647058822e-07, "loss": 1.994280501094181e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 468.0625, "completions/min_length": 375.0, "epoch": 0.3323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8314732313156128, "kl": 0.0019840911554638296, "learning_rate": 1.6617647058823527e-07, "loss": 1.983344554901123e-05, "reward": 0.9573999643325806, "reward_std": 0.12049099802970886, "rewards/DrugCombAccuracyCOTORM/mean": 0.949874997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.2004999965429306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 477.6875, "completions/min_length": 432.0, "epoch": 0.3338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.6474704742431641, "kl": 0.0019009070238098502, "learning_rate": 1.6691176470588234e-07, "loss": 1.885741949081421e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 477.0, "completions/min_length": 419.0, "epoch": 0.3352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.006875516846776009, "kl": 0.0021342732943594456, "learning_rate": 1.676470588235294e-07, "loss": 2.125618266290985e-05, "reward": 0.9033333659172058, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8999999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.10327955335378647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 476.4375, "completions/min_length": 411.0, "epoch": 0.33676470588235297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0959799587726593, "kl": 0.0035803559003397822, "learning_rate": 1.6838235294117646e-07, "loss": 3.595357702579349e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 497.75, "completions/min_length": 410.0, "epoch": 0.3382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8653952479362488, "kl": 0.001940466754604131, "learning_rate": 1.6911764705882354e-07, "loss": 1.9311904907226562e-05, "reward": 0.6939583420753479, "reward_std": 0.126921147108078, "rewards/DrugCombAccuracyCOTORM/mean": 0.6369791626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.42787495255470276, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.18726837635040283, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 433.75, "completions/min_length": 382.0, "epoch": 0.3397058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1592010259628296, "kl": 0.003469448012765497, "learning_rate": 1.6985294117647058e-07, "loss": 3.4146010875701904e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 435.3125, "completions/min_length": 389.0, "epoch": 0.3411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006288998294621706, "kl": 0.0023330822587013245, "learning_rate": 1.7058823529411766e-07, "loss": 2.3256528947968036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 482.625, "completions/min_length": 415.0, "epoch": 0.3426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.080105185508728, "kl": 0.002717935887631029, "learning_rate": 1.713235294117647e-07, "loss": 2.7395784854888916e-05, "reward": 0.13750000298023224, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 423.375, "completions/min_length": 353.0, "epoch": 0.34411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006182131357491016, "kl": 0.00191252410877496, "learning_rate": 1.7205882352941175e-07, "loss": 1.915325083245989e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 495.0, "completions/min_length": 435.0, "epoch": 0.34558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7501721382141113, "kl": 0.002114014350809157, "learning_rate": 1.7279411764705882e-07, "loss": 2.1690015273634344e-05, "reward": 0.8007187843322754, "reward_std": 0.017323819920420647, "rewards/DrugCombAccuracyCOTORM/mean": 0.7577343583106995, "rewards/DrugCombAccuracyCOTORM/std": 0.2519896924495697, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.06404344737529755, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 407.375, "completions/min_length": 368.0, "epoch": 0.34705882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9606614112854004, "kl": 0.0020594187371898443, "learning_rate": 1.7352941176470587e-07, "loss": 2.047812449745834e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 405.25, "completions/min_length": 340.0, "epoch": 0.34852941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 1.290548324584961, "kl": 0.0028594510804396123, "learning_rate": 1.7426470588235292e-07, "loss": 2.8782782464986667e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 434.8125, "completions/min_length": 400.0, "epoch": 0.35, "frac_reward_zero_std": 0.5, "grad_norm": 1.045328140258789, "kl": 0.002205516240792349, "learning_rate": 1.75e-07, "loss": 2.209097146987915e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 438.3125, "completions/min_length": 386.0, "epoch": 0.3514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.021522605791687965, "kl": 0.0028166091651655734, "learning_rate": 1.7573529411764704e-07, "loss": 2.860685344785452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 467.8125, "completions/min_length": 429.0, "epoch": 0.35294117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.012930061668157578, "kl": 0.002873487042961642, "learning_rate": 1.764705882352941e-07, "loss": 2.8735401429003105e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 497.6875, "completions/min_length": 434.0, "epoch": 0.35441176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 0.933821439743042, "kl": 0.0021806873264722526, "learning_rate": 1.7720588235294119e-07, "loss": 2.1904706954956055e-05, "reward": 0.7250000238418579, "reward_std": 0.19578900933265686, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.42979326844215393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 427.75, "completions/min_length": 374.0, "epoch": 0.3558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.005941944196820259, "kl": 0.001911844126880169, "learning_rate": 1.7794117647058823e-07, "loss": 1.9144810721627437e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 401.25, "completions/min_length": 355.0, "epoch": 0.3573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009070244617760181, "kl": 0.002290037431521341, "learning_rate": 1.7867647058823528e-07, "loss": 2.278309693792835e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 475.8125, "completions/min_length": 434.0, "epoch": 0.3588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9771565198898315, "kl": 0.003534860908985138, "learning_rate": 1.7941176470588235e-07, "loss": 3.5785138607025146e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 453.8125, "completions/min_length": 384.0, "epoch": 0.3602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8545901775360107, "kl": 0.002285446273162961, "learning_rate": 1.801470588235294e-07, "loss": 2.2823020117357373e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 418.6875, "completions/min_length": 356.0, "epoch": 0.36176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.006022380664944649, "kl": 0.0020660020236391574, "learning_rate": 1.8088235294117645e-07, "loss": 2.04567131731892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 426.6875, "completions/min_length": 385.0, "epoch": 0.36323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008100219070911407, "kl": 0.002692230569664389, "learning_rate": 1.8161764705882352e-07, "loss": 2.6833278752746992e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 475.3125, "completions/min_length": 406.0, "epoch": 0.36470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 1.366974115371704, "kl": 0.0026969568280037493, "learning_rate": 1.8235294117647057e-07, "loss": 2.7023255825042725e-05, "reward": 0.690625011920929, "reward_std": 0.3060084581375122, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.4149966537952423, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 467.125, "completions/min_length": 374.0, "epoch": 0.36617647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 1.0964912176132202, "kl": 0.002749159117229283, "learning_rate": 1.8308823529411762e-07, "loss": 2.7579520974541083e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 461.0, "completions/min_length": 385.0, "epoch": 0.36764705882352944, "frac_reward_zero_std": 0.0, "grad_norm": 1.4447758197784424, "kl": 0.0024025242892093956, "learning_rate": 1.8382352941176472e-07, "loss": 2.4050474166870117e-05, "reward": 0.5874999761581421, "reward_std": 0.381104975938797, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 470.5, "completions/min_length": 400.0, "epoch": 0.36911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9086419939994812, "kl": 0.0020473000477068126, "learning_rate": 1.8455882352941176e-07, "loss": 2.0466744899749756e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 468.875, "completions/min_length": 414.0, "epoch": 0.37058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.0508449077606201, "kl": 0.0023042277607601136, "learning_rate": 1.852941176470588e-07, "loss": 2.292916178703308e-05, "reward": 0.8464166522026062, "reward_std": 0.2152470499277115, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 427.5, "completions/min_length": 383.0, "epoch": 0.3720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008782428689301014, "kl": 0.002584184578154236, "learning_rate": 1.8602941176470588e-07, "loss": 2.609126750030555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 424.875, "completions/min_length": 364.0, "epoch": 0.3735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0637131929397583, "kl": 0.0024865365703590214, "learning_rate": 1.8676470588235293e-07, "loss": 2.4802982807159424e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 454.1875, "completions/min_length": 390.0, "epoch": 0.375, "frac_reward_zero_std": 0.5, "grad_norm": 0.953162670135498, "kl": 0.0027450216584838927, "learning_rate": 1.875e-07, "loss": 2.7334717742633075e-05, "reward": 0.9451667070388794, "reward_std": 0.058619286864995956, "rewards/DrugCombAccuracyCOTORM/mean": 0.9366666674613953, "rewards/DrugCombAccuracyCOTORM/std": 0.11329410970211029, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 458.5, "completions/min_length": 371.0, "epoch": 0.3764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008451211266219616, "kl": 0.002528161770896986, "learning_rate": 1.8823529411764705e-07, "loss": 2.481858064129483e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 435.125, "completions/min_length": 408.0, "epoch": 0.3779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013997388072311878, "kl": 0.002551566925831139, "learning_rate": 1.889705882352941e-07, "loss": 2.5223131160601042e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 451.1875, "completions/min_length": 399.0, "epoch": 0.37941176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9482811689376831, "kl": 0.0025016844738274813, "learning_rate": 1.8970588235294117e-07, "loss": 2.4978820874821395e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 418.5, "completions/min_length": 374.0, "epoch": 0.38088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.004933813586831093, "kl": 0.001806618645787239, "learning_rate": 1.9044117647058822e-07, "loss": 1.7935051801032387e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 451.75, "completions/min_length": 403.0, "epoch": 0.38235294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.007341957185417414, "kl": 0.0023969602480065078, "learning_rate": 1.9117647058823527e-07, "loss": 2.3873666577856056e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 473.3125, "completions/min_length": 364.0, "epoch": 0.38382352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 0.898857831954956, "kl": 0.0027994848205707967, "learning_rate": 1.9191176470588237e-07, "loss": 2.8267502784729004e-05, "reward": 0.6956250071525574, "reward_std": 0.12307018041610718, "rewards/DrugCombAccuracyCOTORM/mean": 0.6435267925262451, "rewards/DrugCombAccuracyCOTORM/std": 0.4174967110157013, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8080357313156128, "rewards/DrugCombCoverageCOTORM/std": 0.22545304894447327, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 420.5, "completions/min_length": 384.0, "epoch": 0.38529411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.010977794416248798, "kl": 0.0025428085937164724, "learning_rate": 1.9264705882352941e-07, "loss": 2.547044277889654e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 480.0, "completions/min_length": 439.0, "epoch": 0.38676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.34049636125564575, "kl": 0.00539094174746424, "learning_rate": 1.9338235294117646e-07, "loss": 5.3832700359635055e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 443.25, "completions/min_length": 421.0, "epoch": 0.38823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.011354479938745499, "kl": 0.0025003335904330015, "learning_rate": 1.9411764705882353e-07, "loss": 2.4951237719506025e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 502.125, "completions/min_length": 433.0, "epoch": 0.3897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0109094381332397, "kl": 0.0026952283806167543, "learning_rate": 1.9485294117647058e-07, "loss": 2.704206053749658e-05, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 450.0, "completions/min_length": 395.0, "epoch": 0.3911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008528996258974075, "kl": 0.0019555946346372366, "learning_rate": 1.9558823529411763e-07, "loss": 1.9397422875044867e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 499.0625, "completions/min_length": 453.0, "epoch": 0.3926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.019817281514406204, "kl": 0.0025303071888629347, "learning_rate": 1.963235294117647e-07, "loss": 2.5406912754988298e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 574.3125, "completions/min_length": 464.0, "epoch": 0.3941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9162115454673767, "kl": 0.0022040469339117408, "learning_rate": 1.9705882352941175e-07, "loss": 2.184402364946436e-05, "reward": 0.7413889169692993, "reward_std": 0.012397734448313713, "rewards/DrugCombAccuracyCOTORM/mean": 0.6854166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.32749754190444946, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9305555820465088, "rewards/DrugCombCoverageCOTORM/std": 0.17626672983169556, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 502.9375, "completions/min_length": 376.0, "epoch": 0.39558823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.6206061244010925, "kl": 0.009431787097128108, "learning_rate": 1.977941176470588e-07, "loss": 9.280505037168041e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 489.1875, "completions/min_length": 423.0, "epoch": 0.39705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071925572119653225, "kl": 0.0022220644168555737, "learning_rate": 1.9852941176470587e-07, "loss": 2.2256132069742307e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 415.4375, "completions/min_length": 371.0, "epoch": 0.3985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9667515754699707, "kl": 0.0021807892771903425, "learning_rate": 1.9926470588235294e-07, "loss": 2.1750169253209606e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 483.4375, "completions/min_length": 386.0, "epoch": 0.4, "frac_reward_zero_std": 0.5, "grad_norm": 1.025560975074768, "kl": 0.0024952078238129616, "learning_rate": 2e-07, "loss": 2.512335777282715e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 505.5625, "completions/min_length": 447.0, "epoch": 0.40147058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 3.1017510890960693, "kl": 0.007151299680117518, "learning_rate": 2.0073529411764706e-07, "loss": 7.043778896331787e-05, "reward": 0.6711666584014893, "reward_std": 0.33444786071777344, "rewards/DrugCombAccuracyCOTORM/mean": 0.6202083230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44223085045814514, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5091751217842102, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 431.0625, "completions/min_length": 366.0, "epoch": 0.40294117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 1.1118686199188232, "kl": 0.0025091050774790347, "learning_rate": 2.014705882352941e-07, "loss": 2.480468901921995e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 428.375, "completions/min_length": 395.0, "epoch": 0.40441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9725396037101746, "kl": 0.0020134177175350487, "learning_rate": 2.0220588235294116e-07, "loss": 2.0012235836475156e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/mean_length": 586.1875, "completions/min_length": 486.0, "epoch": 0.40588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.40010404586792, "kl": 0.0023937442456372082, "learning_rate": 2.0294117647058823e-07, "loss": 2.378039062023163e-05, "reward": 0.5887083411216736, "reward_std": 0.3089774250984192, "rewards/DrugCombAccuracyCOTORM/mean": 0.5158333778381348, "rewards/DrugCombAccuracyCOTORM/std": 0.38790130615234375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2852468192577362, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 463.0625, "completions/min_length": 367.0, "epoch": 0.4073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6653575897216797, "kl": 0.0021215808810666203, "learning_rate": 2.0367647058823528e-07, "loss": 2.130866050720215e-05, "reward": 0.8031250238418579, "reward_std": 0.07954950630664825, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.2561737895011902, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 484.6875, "completions/min_length": 440.0, "epoch": 0.4088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1149922609329224, "kl": 0.0028706188313663006, "learning_rate": 2.0441176470588233e-07, "loss": 2.8461217880249023e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 430.125, "completions/min_length": 338.0, "epoch": 0.4102941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.4816043376922607, "kl": 0.0024306875420734286, "learning_rate": 2.051470588235294e-07, "loss": 2.3997785319807008e-05, "reward": 0.38749998807907104, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 455.4375, "completions/min_length": 403.0, "epoch": 0.4117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.007855715230107307, "kl": 0.0023411840957123786, "learning_rate": 2.0588235294117645e-07, "loss": 2.3351374693447724e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.6875, "completions/min_length": 361.0, "epoch": 0.41323529411764703, "frac_reward_zero_std": 1.0, "grad_norm": 0.00902154203504324, "kl": 0.0022606199490837753, "learning_rate": 2.0661764705882352e-07, "loss": 2.25507392315194e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 476.5, "completions/min_length": 388.0, "epoch": 0.4147058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2910723686218262, "kl": 0.002520682493923232, "learning_rate": 2.073529411764706e-07, "loss": 2.523532384657301e-05, "reward": 0.9056999683380127, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8914999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.2964784502983093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 444.0625, "completions/min_length": 389.0, "epoch": 0.4161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8348338007926941, "kl": 0.0024073210661299527, "learning_rate": 2.0808823529411764e-07, "loss": 2.411080276942812e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 464.5625, "completions/min_length": 421.0, "epoch": 0.4176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8961706161499023, "kl": 0.0021127357613295317, "learning_rate": 2.0882352941176472e-07, "loss": 2.112412403221242e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 504.25, "completions/min_length": 415.0, "epoch": 0.41911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8597204685211182, "kl": 0.003897182992659509, "learning_rate": 2.0955882352941176e-07, "loss": 3.9267546526389197e-05, "reward": 0.656655490398407, "reward_std": 0.06758120656013489, "rewards/DrugCombAccuracyCOTORM/mean": 0.6023593544960022, "rewards/DrugCombAccuracyCOTORM/std": 0.41558682918548584, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7476799488067627, "rewards/DrugCombCoverageCOTORM/std": 0.4961617588996887, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 505.8125, "completions/min_length": 478.0, "epoch": 0.42058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.941180408000946, "kl": 0.0026759865577332675, "learning_rate": 2.102941176470588e-07, "loss": 2.6332410925533623e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 518.75, "completions/min_length": 445.0, "epoch": 0.42205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.173124074935913, "kl": 0.002685951447347179, "learning_rate": 2.1102941176470588e-07, "loss": 2.680519173736684e-05, "reward": 0.7505834102630615, "reward_std": 0.20698420703411102, "rewards/DrugCombAccuracyCOTORM/mean": 0.6965749859809875, "rewards/DrugCombAccuracyCOTORM/std": 0.46601971983909607, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9332340955734253, "rewards/DrugCombCoverageCOTORM/std": 0.18437696993350983, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 504.125, "completions/min_length": 398.0, "epoch": 0.4235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4243805408477783, "kl": 0.0026407851255498827, "learning_rate": 2.1176470588235293e-07, "loss": 2.6404857635498047e-05, "reward": 0.7085416316986084, "reward_std": 0.34999680519104004, "rewards/DrugCombAccuracyCOTORM/mean": 0.6604166626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.39564645290374756, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.2803354561328888, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 487.0, "completions/min_length": 444.0, "epoch": 0.425, "frac_reward_zero_std": 0.5, "grad_norm": 1.1083595752716064, "kl": 0.002998066775035113, "learning_rate": 2.1249999999999998e-07, "loss": 3.0183189664967358e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 478.3125, "completions/min_length": 411.0, "epoch": 0.4264705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.2947704792022705, "kl": 0.002248745411634445, "learning_rate": 2.1323529411764705e-07, "loss": 2.2452324628829956e-05, "reward": 0.7912946343421936, "reward_std": 0.2929278612136841, "rewards/DrugCombAccuracyCOTORM/mean": 0.7723214626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.34393224120140076, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.734375, "rewards/DrugCombCoverageCOTORM/std": 0.3815402090549469, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 461.375, "completions/min_length": 390.0, "epoch": 0.4279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.448657512664795, "kl": 0.002662498824065551, "learning_rate": 2.139705882352941e-07, "loss": 2.6516616344451904e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 445.6875, "completions/min_length": 396.0, "epoch": 0.4294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014837164431810379, "kl": 0.002833068458130583, "learning_rate": 2.1470588235294114e-07, "loss": 2.7371272153686732e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 468.6875, "completions/min_length": 401.0, "epoch": 0.4308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.006196896079927683, "kl": 0.002390158682828769, "learning_rate": 2.1544117647058825e-07, "loss": 2.35346542467596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 454.5, "completions/min_length": 361.0, "epoch": 0.4323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0004379749298096, "kl": 0.003816931654000655, "learning_rate": 2.161764705882353e-07, "loss": 3.738701343536377e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 465.25, "completions/min_length": 391.0, "epoch": 0.4338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705561399459839, "kl": 0.0031035920546855778, "learning_rate": 2.1691176470588234e-07, "loss": 3.116246807621792e-05, "reward": 0.8999999761581421, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.5, "completions/min_length": 392.0, "epoch": 0.43529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.306156873703003, "kl": 0.0022910789120942354, "learning_rate": 2.176470588235294e-07, "loss": 2.2608601284446195e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 454.625, "completions/min_length": 426.0, "epoch": 0.43676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2622178792953491, "kl": 0.002522514696465805, "learning_rate": 2.1838235294117646e-07, "loss": 2.5306713723693974e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 415.375, "completions/min_length": 349.0, "epoch": 0.43823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0350295305252075, "kl": 0.0024312686873599887, "learning_rate": 2.191176470588235e-07, "loss": 2.4549663066864014e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 509.8125, "completions/min_length": 414.0, "epoch": 0.43970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0993808507919312, "kl": 0.0019537272746674716, "learning_rate": 2.1985294117647058e-07, "loss": 1.9463208445813507e-05, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 437.25, "completions/min_length": 377.0, "epoch": 0.4411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.09810856729745865, "kl": 0.004351145616965368, "learning_rate": 2.2058823529411763e-07, "loss": 4.237905523041263e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 526.125, "completions/min_length": 430.0, "epoch": 0.4426470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5848652124404907, "kl": 0.0022592930181417614, "learning_rate": 2.2132352941176467e-07, "loss": 2.253800630569458e-05, "reward": 0.8259999752044678, "reward_std": 0.29624032974243164, "rewards/DrugCombAccuracyCOTORM/mean": 0.8137500286102295, "rewards/DrugCombAccuracyCOTORM/std": 0.32592177391052246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.523520827293396, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 449.5, "completions/min_length": 415.0, "epoch": 0.4441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.3356722593307495, "kl": 0.0023710503010079265, "learning_rate": 2.2205882352941175e-07, "loss": 2.3730462999083102e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 465.9375, "completions/min_length": 395.0, "epoch": 0.4455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8895370364189148, "kl": 0.0020802201761398464, "learning_rate": 2.2279411764705882e-07, "loss": 2.0878389477729797e-05, "reward": 0.6482083797454834, "reward_std": 0.166981503367424, "rewards/DrugCombAccuracyCOTORM/mean": 0.6006249785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.4823618233203888, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6770833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.4732424020767212, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 435.25, "completions/min_length": 378.0, "epoch": 0.4470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6035960912704468, "kl": 0.003190112009178847, "learning_rate": 2.235294117647059e-07, "loss": 3.241002559661865e-05, "reward": 0.6000000238418579, "reward_std": 0.41403937339782715, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 470.375, "completions/min_length": 406.0, "epoch": 0.4485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008924099616706371, "kl": 0.002541204885346815, "learning_rate": 2.2426470588235294e-07, "loss": 2.5458188247284852e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 430.5, "completions/min_length": 365.0, "epoch": 0.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.006303597241640091, "kl": 0.0019061835191678256, "learning_rate": 2.25e-07, "loss": 1.9156028429279104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 510.0, "completions/min_length": 427.0, "epoch": 0.4514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9264194965362549, "kl": 0.0022763494635000825, "learning_rate": 2.2573529411764706e-07, "loss": 2.2783875465393066e-05, "reward": 0.659375011920929, "reward_std": 0.21030908823013306, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 488.8125, "completions/min_length": 413.0, "epoch": 0.45294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.09175294637680054, "kl": 0.004838080873014405, "learning_rate": 2.264705882352941e-07, "loss": 4.795386121259071e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 447.125, "completions/min_length": 370.0, "epoch": 0.45441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2724874019622803, "kl": 0.0029193109658081084, "learning_rate": 2.2720588235294116e-07, "loss": 2.903677523136139e-05, "reward": 0.6354166269302368, "reward_std": 0.22685523331165314, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9464848041534424, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 427.875, "completions/min_length": 381.0, "epoch": 0.45588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.006168652791529894, "kl": 0.0019873811688739806, "learning_rate": 2.2794117647058823e-07, "loss": 1.9764220269280486e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 408.0625, "completions/min_length": 374.0, "epoch": 0.4573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8678818345069885, "kl": 0.0018836370145436376, "learning_rate": 2.2867647058823528e-07, "loss": 1.8820435798261315e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 424.5, "completions/min_length": 355.0, "epoch": 0.4588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011413431726396084, "kl": 0.002912583702709526, "learning_rate": 2.2941176470588233e-07, "loss": 2.9054968763375655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 503.5, "completions/min_length": 453.0, "epoch": 0.4602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014946664683520794, "kl": 0.002901362837292254, "learning_rate": 2.301470588235294e-07, "loss": 2.898427192121744e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 563.1875, "completions/min_length": 510.0, "epoch": 0.46176470588235297, "frac_reward_zero_std": 1.0, "grad_norm": 0.007846995256841183, "kl": 0.002362793864449486, "learning_rate": 2.3088235294117647e-07, "loss": 2.355137257836759e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 476.625, "completions/min_length": 415.0, "epoch": 0.4632352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0572103261947632, "kl": 0.0023896241327747703, "learning_rate": 2.3161764705882352e-07, "loss": 2.4236738681793213e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 459.625, "completions/min_length": 364.0, "epoch": 0.4647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 2.3382740020751953, "kl": 0.002622348489239812, "learning_rate": 2.323529411764706e-07, "loss": 2.593193858047016e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 450.5, "completions/min_length": 385.0, "epoch": 0.4661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0218050479888916, "kl": 0.002933660871349275, "learning_rate": 2.3308823529411764e-07, "loss": 2.9397753678495064e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 505.8125, "completions/min_length": 469.0, "epoch": 0.4676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7614138126373291, "kl": 0.0016140024818014354, "learning_rate": 2.338235294117647e-07, "loss": 1.614994471310638e-05, "reward": 0.5, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 416.875, "completions/min_length": 369.0, "epoch": 0.46911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008342897519469261, "kl": 0.002211188228102401, "learning_rate": 2.3455882352941176e-07, "loss": 2.1982194084557705e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 455.1875, "completions/min_length": 418.0, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9470598697662354, "kl": 0.002490545390173793, "learning_rate": 2.352941176470588e-07, "loss": 2.51084566116333e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 401.4375, "completions/min_length": 348.0, "epoch": 0.47205882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.0620039701461792, "kl": 0.0020777161698788404, "learning_rate": 2.3602941176470586e-07, "loss": 2.0440660591702908e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 392.5, "completions/min_length": 317.0, "epoch": 0.47352941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 1.2031278610229492, "kl": 0.0023219154099933803, "learning_rate": 2.3676470588235293e-07, "loss": 2.3240281734615564e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 517.8125, "completions/min_length": 409.0, "epoch": 0.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.7683756947517395, "kl": 0.003340593713801354, "learning_rate": 2.3749999999999998e-07, "loss": 3.386711614439264e-05, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 465.75, "completions/min_length": 423.0, "epoch": 0.4764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0108903646469116, "kl": 0.0024102355528157204, "learning_rate": 2.3823529411764702e-07, "loss": 2.425005368422717e-05, "reward": 0.8374999761581421, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 478.25, "completions/min_length": 433.0, "epoch": 0.47794117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 1.0224052667617798, "kl": 0.0023107283050194383, "learning_rate": 2.389705882352941e-07, "loss": 2.300739288330078e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 427.5625, "completions/min_length": 391.0, "epoch": 0.47941176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.006279993802309036, "kl": 0.001872988708782941, "learning_rate": 2.3970588235294117e-07, "loss": 1.884103039628826e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 469.9375, "completions/min_length": 387.0, "epoch": 0.4808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0042842626571655, "kl": 0.0031379261054098606, "learning_rate": 2.4044117647058824e-07, "loss": 3.1035182473715395e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 471.8125, "completions/min_length": 392.0, "epoch": 0.4823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9085301756858826, "kl": 0.0024219303159043193, "learning_rate": 2.4117647058823526e-07, "loss": 2.4281442165374756e-05, "reward": 0.875, "reward_std": 0.18322508037090302, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 485.8125, "completions/min_length": 375.0, "epoch": 0.4838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.408856749534607, "kl": 0.002256676962133497, "learning_rate": 2.4191176470588234e-07, "loss": 2.2338565031532198e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 457.9375, "completions/min_length": 412.0, "epoch": 0.4852941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4300118684768677, "kl": 0.002320505474926904, "learning_rate": 2.426470588235294e-07, "loss": 2.329424023628235e-05, "reward": 0.612500011920929, "reward_std": 0.4397645592689514, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 468.5625, "completions/min_length": 352.0, "epoch": 0.48676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0515539646148682, "kl": 0.0030760058434680104, "learning_rate": 2.4338235294117643e-07, "loss": 3.0666589736938477e-05, "reward": 0.25, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 388.375, "completions/min_length": 360.0, "epoch": 0.48823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.06906774640083313, "kl": 0.0033141363237518817, "learning_rate": 2.441176470588235e-07, "loss": 3.333887434564531e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 450.6875, "completions/min_length": 397.0, "epoch": 0.48970588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 1.1121245622634888, "kl": 0.002910578390583396, "learning_rate": 2.448529411764706e-07, "loss": 2.89231538772583e-05, "reward": 0.7875000238418579, "reward_std": 0.13562026619911194, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3162277936935425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 506.375, "completions/min_length": 439.0, "epoch": 0.49117647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.536301612854004, "kl": 0.0033873170614242554, "learning_rate": 2.455882352941176e-07, "loss": 3.412365913391113e-05, "reward": 0.49068915843963623, "reward_std": 0.41763776540756226, "rewards/DrugCombAccuracyCOTORM/mean": 0.4190905690193176, "rewards/DrugCombAccuracyCOTORM/std": 0.4684404134750366, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5541666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.4107581377029419, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 407.75, "completions/min_length": 378.0, "epoch": 0.49264705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061952583491802216, "kl": 0.0018864216981455684, "learning_rate": 2.4632352941176473e-07, "loss": 1.8883391021518037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 518.5, "completions/min_length": 395.0, "epoch": 0.49411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9220758676528931, "kl": 0.002164083591196686, "learning_rate": 2.4705882352941175e-07, "loss": 2.1738806026405655e-05, "reward": 0.8308262825012207, "reward_std": 0.11592445522546768, "rewards/DrugCombAccuracyCOTORM/mean": 0.8022046685218811, "rewards/DrugCombAccuracyCOTORM/std": 0.26701250672340393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.27716949582099915, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 453.6875, "completions/min_length": 372.0, "epoch": 0.49558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8479742407798767, "kl": 0.0023296346480492502, "learning_rate": 2.477941176470588e-07, "loss": 2.3559408873552456e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 401.8125, "completions/min_length": 333.0, "epoch": 0.4970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9711560606956482, "kl": 0.0024827977758832276, "learning_rate": 2.485294117647059e-07, "loss": 2.4920425857999362e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 493.4375, "completions/min_length": 393.0, "epoch": 0.4985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9915520548820496, "kl": 0.0022891754342708737, "learning_rate": 2.492647058823529e-07, "loss": 2.2739171981811523e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 430.625, "completions/min_length": 374.0, "epoch": 0.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.8894878029823303, "kl": 0.002621842606458813, "learning_rate": 2.5e-07, "loss": 2.6146755772060715e-05, "reward": 0.7256875038146973, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.6707812547683716, "rewards/DrugCombAccuracyCOTORM/std": 0.38554149866104126, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.1280868947505951, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 449.5625, "completions/min_length": 396.0, "epoch": 0.5014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9689577221870422, "kl": 0.0024433861253783107, "learning_rate": 2.5073529411764706e-07, "loss": 2.4527311325073242e-05, "reward": 0.675000011920929, "reward_std": 0.20528724789619446, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 458.6875, "completions/min_length": 433.0, "epoch": 0.5029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.004629035945981741, "kl": 0.002267407573526725, "learning_rate": 2.514705882352941e-07, "loss": 2.2594200345338322e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 524.75, "completions/min_length": 435.0, "epoch": 0.5044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013333200477063656, "kl": 0.0023615438840352, "learning_rate": 2.5220588235294116e-07, "loss": 2.3764498109812848e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 562.0625, "completions/min_length": 491.0, "epoch": 0.5058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.4636938571929932, "kl": 0.0020907406869810075, "learning_rate": 2.5294117647058823e-07, "loss": 2.0928680896759033e-05, "reward": 0.5927083492279053, "reward_std": 0.35278019309043884, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.4905354380607605, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 421.75, "completions/min_length": 375.0, "epoch": 0.5073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0758947134017944, "kl": 0.002115235576638952, "learning_rate": 2.5367647058823525e-07, "loss": 2.1342188119888306e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 493.9375, "completions/min_length": 385.0, "epoch": 0.5088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013252097181975842, "kl": 0.0024592694244347513, "learning_rate": 2.544117647058823e-07, "loss": 2.416867391730193e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 484.5, "completions/min_length": 429.0, "epoch": 0.5102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007423479110002518, "kl": 0.002428641339065507, "learning_rate": 2.551470588235294e-07, "loss": 2.3988615794223733e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 529.5625, "completions/min_length": 496.0, "epoch": 0.5117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.6600476503372192, "kl": 0.0032586331362836063, "learning_rate": 2.558823529411764e-07, "loss": 3.239441502955742e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 507.375, "completions/min_length": 457.0, "epoch": 0.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0656658411026, "kl": 0.002427022613119334, "learning_rate": 2.566176470588235e-07, "loss": 2.421066164970398e-05, "reward": 0.7145833969116211, "reward_std": 0.1634056121110916, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42163705825805664, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 476.3125, "completions/min_length": 402.0, "epoch": 0.5147058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.4947465658187866, "kl": 0.0021527789358515292, "learning_rate": 2.5735294117647057e-07, "loss": 2.1591782569885254e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 493.8125, "completions/min_length": 386.0, "epoch": 0.5161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7846875786781311, "kl": 0.002293680008733645, "learning_rate": 2.5808823529411764e-07, "loss": 2.2903084754943848e-05, "reward": 0.8500000238418579, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 470.4375, "completions/min_length": 365.0, "epoch": 0.5176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0687869787216187, "kl": 0.0027622790075838566, "learning_rate": 2.588235294117647e-07, "loss": 2.7791855245595798e-05, "reward": 0.42500001192092896, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.34375, "rewards/DrugCombAccuracyCOTORM/std": 0.3966001570224762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 524.0625, "completions/min_length": 453.0, "epoch": 0.5191176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 2.099558115005493, "kl": 0.0025179043295793235, "learning_rate": 2.595588235294118e-07, "loss": 2.516806125640869e-05, "reward": 0.6592833399772644, "reward_std": 0.35020512342453003, "rewards/DrugCombAccuracyCOTORM/mean": 0.5897291898727417, "rewards/DrugCombAccuracyCOTORM/std": 0.44843000173568726, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360678017139435, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 458.1875, "completions/min_length": 412.0, "epoch": 0.5205882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.07784565538167953, "kl": 0.003797019278863445, "learning_rate": 2.602941176470588e-07, "loss": 3.8645226595690474e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 479.25, "completions/min_length": 443.0, "epoch": 0.5220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0621834993362427, "kl": 0.002598342136479914, "learning_rate": 2.610294117647059e-07, "loss": 2.5838613510131836e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 455.0, "completions/min_length": 398.0, "epoch": 0.5235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1398812532424927, "kl": 0.0029583930736407638, "learning_rate": 2.6176470588235295e-07, "loss": 2.979743658215739e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 434.25, "completions/min_length": 393.0, "epoch": 0.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.013925647363066673, "kl": 0.002342238265555352, "learning_rate": 2.625e-07, "loss": 2.337879777769558e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 437.0625, "completions/min_length": 385.0, "epoch": 0.5264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8848835825920105, "kl": 0.0021069652284495533, "learning_rate": 2.6323529411764705e-07, "loss": 2.117455005645752e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 497.8125, "completions/min_length": 435.0, "epoch": 0.5279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9634200930595398, "kl": 0.00199420124408789, "learning_rate": 2.639705882352941e-07, "loss": 2.001841858145781e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 456.9375, "completions/min_length": 395.0, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0129377841949463, "kl": 0.0020583842415362597, "learning_rate": 2.6470588235294114e-07, "loss": 2.0689351003966294e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 439.875, "completions/min_length": 378.0, "epoch": 0.5308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8020119667053223, "kl": 0.0021112597023602575, "learning_rate": 2.654411764705882e-07, "loss": 2.1184372599236667e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 466.5625, "completions/min_length": 425.0, "epoch": 0.5323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.006477246526628733, "kl": 0.0019112017180304974, "learning_rate": 2.661764705882353e-07, "loss": 1.904764212667942e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 486.6875, "completions/min_length": 426.0, "epoch": 0.5338235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4794074296951294, "kl": 0.0028060839395038784, "learning_rate": 2.669117647058823e-07, "loss": 2.804398536682129e-05, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 438.5625, "completions/min_length": 367.0, "epoch": 0.5352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9754598736763, "kl": 0.0021780396346002817, "learning_rate": 2.676470588235294e-07, "loss": 2.171651976823341e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 470.8125, "completions/min_length": 412.0, "epoch": 0.5367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9875452518463135, "kl": 0.0020884792029391974, "learning_rate": 2.6838235294117646e-07, "loss": 2.0872801542282104e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 467.625, "completions/min_length": 413.0, "epoch": 0.538235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8709295988082886, "kl": 0.0021805522846989334, "learning_rate": 2.6911764705882353e-07, "loss": 2.19081666728016e-05, "reward": 0.6678333282470703, "reward_std": 0.21822859346866608, "rewards/DrugCombAccuracyCOTORM/mean": 0.6525000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.46795299649238586, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8766518831253052, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 420.125, "completions/min_length": 326.0, "epoch": 0.5397058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.8504785299301147, "kl": 0.0028973471489734948, "learning_rate": 2.6985294117647055e-07, "loss": 2.9125860237400047e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 452.8125, "completions/min_length": 410.0, "epoch": 0.5411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009286975488066673, "kl": 0.0024714474857319146, "learning_rate": 2.705882352941176e-07, "loss": 2.4635610316181555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 458.9375, "completions/min_length": 327.0, "epoch": 0.5426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009611071087419987, "kl": 0.002721042139455676, "learning_rate": 2.713235294117647e-07, "loss": 2.7444137231213972e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 552.0625, "completions/min_length": 503.0, "epoch": 0.5441176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3967373371124268, "kl": 0.002700531273148954, "learning_rate": 2.720588235294117e-07, "loss": 2.6702880859375e-05, "reward": 0.6573125123977661, "reward_std": 0.24293625354766846, "rewards/DrugCombAccuracyCOTORM/mean": 0.5774999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.4716236889362335, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.08589804172515869, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 548.875, "completions/min_length": 434.0, "epoch": 0.5455882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5062144994735718, "kl": 0.0029409131966531277, "learning_rate": 2.727941176470588e-07, "loss": 2.9109418392181396e-05, "reward": 0.6097306609153748, "reward_std": 0.38316214084625244, "rewards/DrugCombAccuracyCOTORM/mean": 0.538856029510498, "rewards/DrugCombAccuracyCOTORM/std": 0.45593684911727905, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.54081130027771, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 475.5, "completions/min_length": 431.0, "epoch": 0.5470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8129962086677551, "kl": 0.002618028811411932, "learning_rate": 2.735294117647059e-07, "loss": 2.607970600365661e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 478.0, "completions/min_length": 401.0, "epoch": 0.5485294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.491994023323059, "kl": 0.002475322486134246, "learning_rate": 2.7426470588235294e-07, "loss": 2.4765729904174805e-05, "reward": 0.84375, "reward_std": 0.22469735145568848, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 478.5, "completions/min_length": 420.0, "epoch": 0.55, "frac_reward_zero_std": 0.5, "grad_norm": 0.9132604598999023, "kl": 0.0032001897343434393, "learning_rate": 2.75e-07, "loss": 3.1899591704132035e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 495.375, "completions/min_length": 440.0, "epoch": 0.5514705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.6465115547180176, "kl": 0.0026015910552814603, "learning_rate": 2.757352941176471e-07, "loss": 2.5935471057891846e-05, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 465.4375, "completions/min_length": 420.0, "epoch": 0.5529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9664167165756226, "kl": 0.0020052432373631746, "learning_rate": 2.764705882352941e-07, "loss": 2.0153820514678955e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 541.5625, "completions/min_length": 449.0, "epoch": 0.5544117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.4324218034744263, "kl": 0.003029226849321276, "learning_rate": 2.772058823529412e-07, "loss": 2.9958784580230713e-05, "reward": 0.5969895720481873, "reward_std": 0.15592458844184875, "rewards/DrugCombAccuracyCOTORM/mean": 0.5258593559265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43434572219848633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7630208134651184, "rewards/DrugCombCoverageCOTORM/std": 0.25217515230178833, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 486.6875, "completions/min_length": 410.0, "epoch": 0.5558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.971061646938324, "kl": 0.00269422322162427, "learning_rate": 2.7794117647058826e-07, "loss": 2.7125166525365785e-05, "reward": 0.6163333058357239, "reward_std": 0.045761626213788986, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371693611145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.18257418274879456, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 443.1875, "completions/min_length": 383.0, "epoch": 0.5573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.004674090072512627, "kl": 0.001959218643605709, "learning_rate": 2.786764705882353e-07, "loss": 1.9663970306282863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 471.4375, "completions/min_length": 416.0, "epoch": 0.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008597432635724545, "kl": 0.002110583387548104, "learning_rate": 2.7941176470588235e-07, "loss": 2.1086327251396142e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 439.625, "completions/min_length": 369.0, "epoch": 0.5602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.6069654822349548, "kl": 0.005277064745314419, "learning_rate": 2.801470588235294e-07, "loss": 5.2898976719006896e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 535.375, "completions/min_length": 420.0, "epoch": 0.5617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.716174304485321, "kl": 0.0022303966688923538, "learning_rate": 2.8088235294117644e-07, "loss": 2.2324569727061316e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/mean_length": 549.125, "completions/min_length": 423.0, "epoch": 0.5632352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0264058113098145, "kl": 0.0021440388227347285, "learning_rate": 2.816176470588235e-07, "loss": 2.1860003471374512e-05, "reward": 0.5658978223800659, "reward_std": 0.030886253342032433, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6746031641960144, "rewards/DrugCombCoverageCOTORM/std": 0.5250986218452454, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 462.8125, "completions/min_length": 385.0, "epoch": 0.5647058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02495858445763588, "kl": 0.002725477155763656, "learning_rate": 2.823529411764706e-07, "loss": 2.7541724193724804e-05, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 435.5625, "completions/min_length": 389.0, "epoch": 0.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01477675698697567, "kl": 0.0020571686618495733, "learning_rate": 2.830882352941176e-07, "loss": 2.0812021830352023e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 441.3125, "completions/min_length": 385.0, "epoch": 0.5676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0437712669372559, "kl": 0.0025565001415088773, "learning_rate": 2.838235294117647e-07, "loss": 2.561178735049907e-05, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 484.75, "completions/min_length": 395.0, "epoch": 0.5691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010064518079161644, "kl": 0.0019719194096978754, "learning_rate": 2.8455882352941176e-07, "loss": 1.9684732251334935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 469.25, "completions/min_length": 397.0, "epoch": 0.5705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.008192349225282669, "kl": 0.001855799462646246, "learning_rate": 2.852941176470588e-07, "loss": 1.8396722225588746e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 441.375, "completions/min_length": 388.0, "epoch": 0.5720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8394162654876709, "kl": 0.002424969570711255, "learning_rate": 2.8602941176470585e-07, "loss": 2.4437904357910156e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 508.375, "completions/min_length": 428.0, "epoch": 0.5735294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 2.137739896774292, "kl": 0.003558976692147553, "learning_rate": 2.8676470588235293e-07, "loss": 3.516674041748047e-05, "reward": 0.42028769850730896, "reward_std": 0.35048893094062805, "rewards/DrugCombAccuracyCOTORM/mean": 0.3139880895614624, "rewards/DrugCombAccuracyCOTORM/std": 0.41030001640319824, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6909722089767456, "rewards/DrugCombCoverageCOTORM/std": 0.5370888710021973, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 476.1875, "completions/min_length": 411.0, "epoch": 0.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.9969527125358582, "kl": 0.001908525446197018, "learning_rate": 2.8749999999999995e-07, "loss": 1.8993743651662953e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 426.4375, "completions/min_length": 363.0, "epoch": 0.5764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1727944612503052, "kl": 0.0020588893676176667, "learning_rate": 2.88235294117647e-07, "loss": 2.063065767288208e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 457.6875, "completions/min_length": 402.0, "epoch": 0.5779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.005487978924065828, "kl": 0.0021351290051825345, "learning_rate": 2.889705882352941e-07, "loss": 2.1538231521844864e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 505.0625, "completions/min_length": 438.0, "epoch": 0.5794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.27031409740448, "kl": 0.002731306536588818, "learning_rate": 2.8970588235294117e-07, "loss": 2.70081618509721e-05, "reward": 0.5598958134651184, "reward_std": 0.011342762038111687, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5989583134651184, "rewards/DrugCombCoverageCOTORM/std": 0.4422362744808197, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 438.4375, "completions/min_length": 378.0, "epoch": 0.5808823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.007157233078032732, "kl": 0.0019300069543533027, "learning_rate": 2.9044117647058824e-07, "loss": 1.9202223484171554e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 484.125, "completions/min_length": 408.0, "epoch": 0.5823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0314337015151978, "kl": 0.002931323368102312, "learning_rate": 2.911764705882353e-07, "loss": 2.925097942352295e-05, "reward": 0.6291666626930237, "reward_std": 0.17130452394485474, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4905354380607605, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 477.8125, "completions/min_length": 414.0, "epoch": 0.5838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2517114877700806, "kl": 0.0031795738614164293, "learning_rate": 2.9191176470588234e-07, "loss": 3.2069976441562176e-05, "reward": 0.4749999940395355, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 493.375, "completions/min_length": 419.0, "epoch": 0.5852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9331080913543701, "kl": 0.0021297618513926864, "learning_rate": 2.926470588235294e-07, "loss": 2.1301908418536186e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 437.25, "completions/min_length": 369.0, "epoch": 0.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1126474142074585, "kl": 0.0022018298332113773, "learning_rate": 2.933823529411765e-07, "loss": 2.2030650143278763e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 427.9375, "completions/min_length": 361.0, "epoch": 0.5882352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1365875005722046, "kl": 0.002309428935404867, "learning_rate": 2.941176470588235e-07, "loss": 2.2609683583141305e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 519.125, "completions/min_length": 421.0, "epoch": 0.5897058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5268237590789795, "kl": 0.0036373738548718393, "learning_rate": 2.948529411764706e-07, "loss": 3.619864583015442e-05, "reward": 0.6702113151550293, "reward_std": 0.2740660309791565, "rewards/DrugCombAccuracyCOTORM/mean": 0.6118526458740234, "rewards/DrugCombAccuracyCOTORM/std": 0.4112505614757538, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8072916269302368, "rewards/DrugCombCoverageCOTORM/std": 0.2716866731643677, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 474.3125, "completions/min_length": 389.0, "epoch": 0.5911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9552667737007141, "kl": 0.0022754475940018892, "learning_rate": 2.9558823529411765e-07, "loss": 2.305302768945694e-05, "reward": 0.7994500398635864, "reward_std": 0.19305697083473206, "rewards/DrugCombAccuracyCOTORM/mean": 0.7664999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.421848326921463, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.49916598200798035, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 484.4375, "completions/min_length": 433.0, "epoch": 0.5926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055655213072896, "kl": 0.002005594113143161, "learning_rate": 2.9632352941176467e-07, "loss": 1.9961300495197065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 502.9375, "completions/min_length": 433.0, "epoch": 0.5941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9117209315299988, "kl": 0.002317458391189575, "learning_rate": 2.9705882352941175e-07, "loss": 2.3326188966166228e-05, "reward": 0.5375000238418579, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 515.25, "completions/min_length": 428.0, "epoch": 0.5955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0280568599700928, "kl": 0.002073273732094094, "learning_rate": 2.977941176470588e-07, "loss": 2.078711986541748e-05, "reward": 0.5390630960464478, "reward_std": 0.08503010869026184, "rewards/DrugCombAccuracyCOTORM/mean": 0.4287116527557373, "rewards/DrugCombAccuracyCOTORM/std": 0.46367529034614563, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.10673907399177551, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 406.875, "completions/min_length": 372.0, "epoch": 0.5970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00520480377599597, "kl": 0.001962809357792139, "learning_rate": 2.9852941176470584e-07, "loss": 1.9627948859124444e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 422.125, "completions/min_length": 366.0, "epoch": 0.5985294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3810986280441284, "kl": 0.002666723681613803, "learning_rate": 2.992647058823529e-07, "loss": 2.6494264602661133e-05, "reward": 0.6089166402816772, "reward_std": 0.2576225996017456, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 391.0, "completions/min_length": 369.0, "epoch": 0.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.499966025352478, "kl": 0.0025979304627981037, "learning_rate": 3e-07, "loss": 2.6263296604156494e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 439.0, "completions/min_length": 379.0, "epoch": 0.6014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.3055239915847778, "kl": 0.0026401618379168212, "learning_rate": 3.00735294117647e-07, "loss": 2.619624137878418e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 426.1875, "completions/min_length": 327.0, "epoch": 0.6029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007282329723238945, "kl": 0.0024433385115116835, "learning_rate": 3.014705882352941e-07, "loss": 2.4455981474602595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 504.1875, "completions/min_length": 399.0, "epoch": 0.6044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8598785996437073, "kl": 0.0019540457578841597, "learning_rate": 3.0220588235294115e-07, "loss": 1.956052437890321e-05, "reward": 0.6362916827201843, "reward_std": 0.10264834761619568, "rewards/DrugCombAccuracyCOTORM/mean": 0.54666668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.5017303824424744, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 466.875, "completions/min_length": 400.0, "epoch": 0.6058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.007165136747062206, "kl": 0.0021504430333152413, "learning_rate": 3.029411764705882e-07, "loss": 2.1660722268279642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 439.1875, "completions/min_length": 355.0, "epoch": 0.6073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008785675279796124, "kl": 0.002716698916628957, "learning_rate": 3.0367647058823525e-07, "loss": 2.738892908382695e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 428.0625, "completions/min_length": 363.0, "epoch": 0.6088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.117856502532959, "kl": 0.0022873267298564315, "learning_rate": 3.044117647058823e-07, "loss": 2.296359889442101e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 449.375, "completions/min_length": 385.0, "epoch": 0.6102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1657382249832153, "kl": 0.0029455441690515727, "learning_rate": 3.0514705882352945e-07, "loss": 3.0003435313119553e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 437.375, "completions/min_length": 351.0, "epoch": 0.611764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.029906153678894, "kl": 0.0023743718047626317, "learning_rate": 3.0588235294117647e-07, "loss": 2.374500036239624e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 471.9375, "completions/min_length": 415.0, "epoch": 0.6132352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.94363933801651, "kl": 0.0019189910090062767, "learning_rate": 3.0661764705882354e-07, "loss": 1.9300729036331177e-05, "reward": 0.4267500042915344, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.35374999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.457746297121048, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.4669642150402069, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 421.1875, "completions/min_length": 332.0, "epoch": 0.6147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.005582171957939863, "kl": 0.002280411368701607, "learning_rate": 3.073529411764706e-07, "loss": 2.2571934096049517e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 540.4375, "completions/min_length": 431.0, "epoch": 0.6161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.999921441078186, "kl": 0.002716995106311515, "learning_rate": 3.0808823529411764e-07, "loss": 2.727110404521227e-05, "reward": 0.2690013647079468, "reward_std": 0.12841685116291046, "rewards/DrugCombAccuracyCOTORM/mean": 0.16470222175121307, "rewards/DrugCombAccuracyCOTORM/std": 0.27599605917930603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3723958134651184, "rewards/DrugCombCoverageCOTORM/std": 0.41596296429634094, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 447.0, "completions/min_length": 389.0, "epoch": 0.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.005538220051676035, "kl": 0.0020695273706223816, "learning_rate": 3.088235294117647e-07, "loss": 2.0867657440248877e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 450.9375, "completions/min_length": 391.0, "epoch": 0.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.191253423690796, "kl": 0.002218077628640458, "learning_rate": 3.095588235294118e-07, "loss": 2.2009015083312988e-05, "reward": 0.6081041693687439, "reward_std": 0.03346627950668335, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4515184462070465, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.49441325664520264, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 454.5, "completions/min_length": 371.0, "epoch": 0.6205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705639481544495, "kl": 0.0021190991974435747, "learning_rate": 3.102941176470588e-07, "loss": 2.1472573280334473e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.9375, "completions/min_length": 403.0, "epoch": 0.6220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.867694616317749, "kl": 0.002243669208837673, "learning_rate": 3.110294117647059e-07, "loss": 2.2375192202161998e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 454.0625, "completions/min_length": 418.0, "epoch": 0.6235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01256725937128067, "kl": 0.002197004680056125, "learning_rate": 3.1176470588235295e-07, "loss": 2.208872501796577e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 473.0625, "completions/min_length": 425.0, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 1.5184099674224854, "kl": 0.002435155911371112, "learning_rate": 3.1249999999999997e-07, "loss": 2.4840235710144043e-05, "reward": 0.9026666879653931, "reward_std": 0.2753002643585205, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 445.5, "completions/min_length": 394.0, "epoch": 0.6264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1238179206848145, "kl": 0.0028697774396277964, "learning_rate": 3.1323529411764705e-07, "loss": 2.8364551326376386e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 443.5625, "completions/min_length": 400.0, "epoch": 0.6279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007442132104188204, "kl": 0.001777752477210015, "learning_rate": 3.139705882352941e-07, "loss": 1.7709218809613958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 420.6875, "completions/min_length": 356.0, "epoch": 0.6294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.006825589574873447, "kl": 0.0021725859260186553, "learning_rate": 3.1470588235294114e-07, "loss": 2.1703652237192728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 455.9375, "completions/min_length": 402.0, "epoch": 0.6308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008380988612771034, "kl": 0.002420387521851808, "learning_rate": 3.154411764705882e-07, "loss": 2.42231380980229e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 540.4375, "completions/min_length": 493.0, "epoch": 0.6323529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 2.0142157077789307, "kl": 0.02934084419393912, "learning_rate": 3.161764705882353e-07, "loss": 0.0003007426857948303, "reward": 0.9270833730697632, "reward_std": 0.2062394767999649, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 404.0, "completions/min_length": 349.0, "epoch": 0.6338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0805249214172363, "kl": 0.0037884447374381125, "learning_rate": 3.169117647058823e-07, "loss": 3.819167613983154e-05, "reward": 0.6609375476837158, "reward_std": 0.2093183547258377, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 486.3125, "completions/min_length": 413.0, "epoch": 0.6352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.601238489151001, "kl": 0.0033708448172546923, "learning_rate": 3.176470588235294e-07, "loss": 3.392994403839111e-05, "reward": 0.41458338499069214, "reward_std": 0.2533799409866333, "rewards/DrugCombAccuracyCOTORM/mean": 0.3541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.37453675270080566, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.3095695972442627, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 475.75, "completions/min_length": 396.0, "epoch": 0.6367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1395070552825928, "kl": 0.00243925373069942, "learning_rate": 3.1838235294117646e-07, "loss": 2.4429049517493695e-05, "reward": 0.737333357334137, "reward_std": 0.19210051000118256, "rewards/DrugCombAccuracyCOTORM/mean": 0.67166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.4718286097049713, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 431.875, "completions/min_length": 337.0, "epoch": 0.638235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006137566175311804, "kl": 0.001984433562029153, "learning_rate": 3.191176470588235e-07, "loss": 1.9746428733924404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 419.5625, "completions/min_length": 372.0, "epoch": 0.6397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016996629536151886, "kl": 0.0030538318096660078, "learning_rate": 3.1985294117647055e-07, "loss": 3.017155540874228e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 424.5, "completions/min_length": 365.0, "epoch": 0.6411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018741020932793617, "kl": 0.0025028324162121862, "learning_rate": 3.205882352941177e-07, "loss": 2.5146688130917028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 425.125, "completions/min_length": 328.0, "epoch": 0.6426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.11507425457239151, "kl": 0.004673488583648577, "learning_rate": 3.213235294117647e-07, "loss": 4.589118543663062e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 411.9375, "completions/min_length": 356.0, "epoch": 0.6441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.005492673255503178, "kl": 0.0018510147056076676, "learning_rate": 3.2205882352941177e-07, "loss": 1.8376278603682294e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 471.0, "completions/min_length": 418.0, "epoch": 0.6455882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.008793056011199951, "kl": 0.0022437842562794685, "learning_rate": 3.2279411764705884e-07, "loss": 2.229724486824125e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 465.4375, "completions/min_length": 384.0, "epoch": 0.6470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0180097818374634, "kl": 0.0023375586606562138, "learning_rate": 3.2352941176470586e-07, "loss": 2.326071262359619e-05, "reward": 0.6937500238418579, "reward_std": 0.2541618049144745, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 434.5625, "completions/min_length": 380.0, "epoch": 0.6485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0516183376312256, "kl": 0.002197250461904332, "learning_rate": 3.2426470588235294e-07, "loss": 2.188417420256883e-05, "reward": 0.5943333506584167, "reward_std": 0.03111269511282444, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 420.875, "completions/min_length": 341.0, "epoch": 0.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.005389060825109482, "kl": 0.0023428095155395567, "learning_rate": 3.25e-07, "loss": 2.351741204620339e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 418.375, "completions/min_length": 392.0, "epoch": 0.6514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.005996128544211388, "kl": 0.0025085133966058493, "learning_rate": 3.2573529411764703e-07, "loss": 2.5079305487452075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 435.375, "completions/min_length": 390.0, "epoch": 0.6529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.004911895841360092, "kl": 0.0020246450731065124, "learning_rate": 3.264705882352941e-07, "loss": 2.017525548581034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 393.125, "completions/min_length": 342.0, "epoch": 0.6544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010017625987529755, "kl": 0.0024848702014423907, "learning_rate": 3.272058823529412e-07, "loss": 2.475920700817369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 405.6875, "completions/min_length": 354.0, "epoch": 0.6558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.009701741859316826, "kl": 0.00220111440285109, "learning_rate": 3.279411764705882e-07, "loss": 2.21865666389931e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 470.9375, "completions/min_length": 371.0, "epoch": 0.6573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008690685965120792, "kl": 0.002353052725084126, "learning_rate": 3.286764705882353e-07, "loss": 2.3692209651926532e-05, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 494.1875, "completions/min_length": 396.0, "epoch": 0.6588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.430379867553711, "kl": 0.002559034212026745, "learning_rate": 3.2941176470588235e-07, "loss": 2.562999725341797e-05, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 432.125, "completions/min_length": 374.0, "epoch": 0.6602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010458147153258324, "kl": 0.002429953688988462, "learning_rate": 3.3014705882352937e-07, "loss": 2.4154378479579464e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 449.625, "completions/min_length": 370.0, "epoch": 0.6617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7284381985664368, "kl": 0.0019524797098711133, "learning_rate": 3.3088235294117644e-07, "loss": 1.9378159777261317e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 391.6875, "completions/min_length": 322.0, "epoch": 0.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007488008588552475, "kl": 0.0022889320680405945, "learning_rate": 3.316176470588235e-07, "loss": 2.2603888282901607e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/mean_length": 541.6875, "completions/min_length": 449.0, "epoch": 0.6647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8227601647377014, "kl": 0.0025725472369231284, "learning_rate": 3.3235294117647054e-07, "loss": 2.5346875190734863e-05, "reward": 0.6017500162124634, "reward_std": 0.05449306592345238, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47791430354118347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.49916601181030273, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 484.0, "completions/min_length": 390.0, "epoch": 0.6661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.846712589263916, "kl": 0.002118313714163378, "learning_rate": 3.330882352941176e-07, "loss": 2.126720210071653e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 458.9375, "completions/min_length": 398.0, "epoch": 0.6676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8988734483718872, "kl": 0.001836075825849548, "learning_rate": 3.338235294117647e-07, "loss": 1.833587884902954e-05, "reward": 0.7565000057220459, "reward_std": 0.0910470187664032, "rewards/DrugCombAccuracyCOTORM/mean": 0.721666693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.3305080235004425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 460.25, "completions/min_length": 436.0, "epoch": 0.6691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9932453036308289, "kl": 0.0020586512691807, "learning_rate": 3.345588235294117e-07, "loss": 2.0489096641540527e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 482.375, "completions/min_length": 426.0, "epoch": 0.6705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.005741601809859276, "kl": 0.0020087355514988303, "learning_rate": 3.352941176470588e-07, "loss": 2.010776734095998e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 422.5, "completions/min_length": 354.0, "epoch": 0.6720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010568851605057716, "kl": 0.002419094293145463, "learning_rate": 3.360294117647059e-07, "loss": 2.4108747311402112e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 461.8125, "completions/min_length": 429.0, "epoch": 0.6735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.004801210016012192, "kl": 0.0016573598550166935, "learning_rate": 3.367647058823529e-07, "loss": 1.6597070498391986e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 534.25, "completions/min_length": 469.0, "epoch": 0.675, "frac_reward_zero_std": 0.5, "grad_norm": 0.9980102181434631, "kl": 0.0020766176749020815, "learning_rate": 3.375e-07, "loss": 2.0623207092285156e-05, "reward": 0.7400000095367432, "reward_std": 0.2579036355018616, "rewards/DrugCombAccuracyCOTORM/mean": 0.737500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 428.8125, "completions/min_length": 391.0, "epoch": 0.6764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.005289887543767691, "kl": 0.001956649066414684, "learning_rate": 3.3823529411764707e-07, "loss": 1.9564537069527432e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 481.625, "completions/min_length": 438.0, "epoch": 0.6779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.049201488494873, "kl": 0.0024240289349108934, "learning_rate": 3.3897058823529415e-07, "loss": 2.4535587726859376e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 433.375, "completions/min_length": 385.0, "epoch": 0.6794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009690220467746258, "kl": 0.0024176807492040098, "learning_rate": 3.3970588235294117e-07, "loss": 2.3942338884808123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/mean_length": 511.8125, "completions/min_length": 406.0, "epoch": 0.6808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9384303092956543, "kl": 0.002544850402045995, "learning_rate": 3.4044117647058824e-07, "loss": 2.519923509680666e-05, "reward": 0.8687291741371155, "reward_std": 0.13302607834339142, "rewards/DrugCombAccuracyCOTORM/mean": 0.8463281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.2788003087043762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17213259637355804, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 494.75, "completions/min_length": 439.0, "epoch": 0.6823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9337291121482849, "kl": 0.0023337056627497077, "learning_rate": 3.411764705882353e-07, "loss": 2.3226502889883704e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 449.375, "completions/min_length": 373.0, "epoch": 0.6838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3238624334335327, "kl": 0.002431652304949239, "learning_rate": 3.4191176470588233e-07, "loss": 2.4215285520767793e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 446.3125, "completions/min_length": 404.0, "epoch": 0.6852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.020223213359713554, "kl": 0.002777946356218308, "learning_rate": 3.426470588235294e-07, "loss": 2.7834448701469228e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.875, "completions/min_length": 396.0, "epoch": 0.6867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9969921112060547, "kl": 0.002066049666609615, "learning_rate": 3.433823529411765e-07, "loss": 2.0517811208264902e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 436.75, "completions/min_length": 377.0, "epoch": 0.6882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.019225120544433594, "kl": 0.002516927692340687, "learning_rate": 3.441176470588235e-07, "loss": 2.5107841793214902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 456.4375, "completions/min_length": 395.0, "epoch": 0.6897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006204646546393633, "kl": 0.0019153572793584317, "learning_rate": 3.448529411764706e-07, "loss": 1.9231283658882603e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 483.6875, "completions/min_length": 425.0, "epoch": 0.6911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.197817087173462, "kl": 0.0024982758332043886, "learning_rate": 3.4558823529411765e-07, "loss": 2.4706125259399414e-05, "reward": 0.9026666879653931, "reward_std": 0.18022631108760834, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3039928674697876, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2277100384235382, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 464.0, "completions/min_length": 370.0, "epoch": 0.6926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8622439503669739, "kl": 0.0020807318505831063, "learning_rate": 3.4632352941176467e-07, "loss": 2.078711986541748e-05, "reward": 0.7479166984558105, "reward_std": 0.2088208645582199, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 454.625, "completions/min_length": 398.0, "epoch": 0.6941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9954227805137634, "kl": 0.0029654347454197705, "learning_rate": 3.4705882352941174e-07, "loss": 2.9727816581726074e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 474.4375, "completions/min_length": 411.0, "epoch": 0.6955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.025460992008447647, "kl": 0.0028069465479347855, "learning_rate": 3.477941176470588e-07, "loss": 2.8278080208110623e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 434.375, "completions/min_length": 353.0, "epoch": 0.6970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077323149889707565, "kl": 0.0023473097826354206, "learning_rate": 3.4852941176470584e-07, "loss": 2.3566999516333453e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 473.3125, "completions/min_length": 386.0, "epoch": 0.6985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.006559798028320074, "kl": 0.00217974663246423, "learning_rate": 3.492647058823529e-07, "loss": 2.197564026573673e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 467.0, "completions/min_length": 432.0, "epoch": 0.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.08854583650827408, "kl": 0.004725767066702247, "learning_rate": 3.5e-07, "loss": 4.656722012441605e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 464.5, "completions/min_length": 417.0, "epoch": 0.7014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1313244104385376, "kl": 0.0031038979650475085, "learning_rate": 3.50735294117647e-07, "loss": 3.081560134887695e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 457.75, "completions/min_length": 374.0, "epoch": 0.7029411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3944694995880127, "kl": 0.002441412245389074, "learning_rate": 3.514705882352941e-07, "loss": 2.4624168872833252e-05, "reward": 0.6499999761581421, "reward_std": 0.4851592183113098, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 469.0625, "completions/min_length": 403.0, "epoch": 0.7044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.1352382898330688, "kl": 0.004062461463036016, "learning_rate": 3.522058823529412e-07, "loss": 3.8273632526397705e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 414.625, "completions/min_length": 349.0, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9657707810401917, "kl": 0.002178692870074883, "learning_rate": 3.529411764705882e-07, "loss": 2.1900546926190145e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 456.5, "completions/min_length": 408.0, "epoch": 0.7073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.005552344955503941, "kl": 0.002180739800678566, "learning_rate": 3.536764705882353e-07, "loss": 2.1701234800275415e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 435.4375, "completions/min_length": 405.0, "epoch": 0.7088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1394399404525757, "kl": 0.002140967611921951, "learning_rate": 3.5441176470588237e-07, "loss": 2.1264337192405947e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 462.75, "completions/min_length": 408.0, "epoch": 0.7102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.076709508895874, "kl": 0.002519508096156642, "learning_rate": 3.551470588235294e-07, "loss": 2.537667751312256e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 626.8125, "completions/min_length": 451.0, "epoch": 0.711764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9803289175033569, "kl": 0.0022894973517395556, "learning_rate": 3.5588235294117647e-07, "loss": 2.397294701950159e-05, "reward": 0.6477679014205933, "reward_std": 0.20419202744960785, "rewards/DrugCombAccuracyCOTORM/mean": 0.6339285373687744, "rewards/DrugCombAccuracyCOTORM/std": 0.4635525643825531, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.25, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 450.125, "completions/min_length": 383.0, "epoch": 0.7132352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3737539052963257, "kl": 0.0025406202767044306, "learning_rate": 3.5661764705882354e-07, "loss": 2.5175511837005615e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 453.0, "completions/min_length": 414.0, "epoch": 0.7147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011060955002903938, "kl": 0.0018743126129265875, "learning_rate": 3.5735294117647056e-07, "loss": 1.87913938134443e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 463.6875, "completions/min_length": 410.0, "epoch": 0.7161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.166558027267456, "kl": 0.0028820362058468163, "learning_rate": 3.5808823529411763e-07, "loss": 2.8485283110057935e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 471.0, "completions/min_length": 401.0, "epoch": 0.7176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5756477117538452, "kl": 0.002444019424729049, "learning_rate": 3.588235294117647e-07, "loss": 2.466142177581787e-05, "reward": 0.2873333692550659, "reward_std": 0.2424648106098175, "rewards/DrugCombAccuracyCOTORM/mean": 0.24979168176651, "rewards/DrugCombAccuracyCOTORM/std": 0.40829405188560486, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8465616703033447, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 476.6875, "completions/min_length": 399.0, "epoch": 0.7191176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.904744029045105, "kl": 0.0025732313515618443, "learning_rate": 3.5955882352941173e-07, "loss": 2.5972723960876465e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 445.625, "completions/min_length": 342.0, "epoch": 0.7205882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.468420386314392, "kl": 0.0025718794495332986, "learning_rate": 3.602941176470588e-07, "loss": 2.5682151317596436e-05, "reward": 0.78125, "reward_std": 0.38548368215560913, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 487.75, "completions/min_length": 411.0, "epoch": 0.7220588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4054597616195679, "kl": 0.001898720336612314, "learning_rate": 3.610294117647059e-07, "loss": 1.897662878036499e-05, "reward": 0.44708335399627686, "reward_std": 0.2846984267234802, "rewards/DrugCombAccuracyCOTORM/mean": 0.3583333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.4535326361656189, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6465721726417542, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 429.3125, "completions/min_length": 382.0, "epoch": 0.7235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0683422088623047, "kl": 0.0018442902946844697, "learning_rate": 3.617647058823529e-07, "loss": 1.8493927200324833e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 408.75, "completions/min_length": 357.0, "epoch": 0.725, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206695780158043, "kl": 0.0037340359413065016, "learning_rate": 3.6249999999999997e-07, "loss": 3.702104004332796e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 442.9375, "completions/min_length": 389.0, "epoch": 0.7264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0651038885116577, "kl": 0.0035660161229316145, "learning_rate": 3.6323529411764704e-07, "loss": 3.555417060852051e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 456.5, "completions/min_length": 411.0, "epoch": 0.7279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714905381202698, "kl": 0.0023091469483915716, "learning_rate": 3.6397058823529406e-07, "loss": 2.327561378479004e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 435.75, "completions/min_length": 363.0, "epoch": 0.7294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01103475783020258, "kl": 0.0028002698672935367, "learning_rate": 3.6470588235294114e-07, "loss": 2.797939487209078e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 466.625, "completions/min_length": 392.0, "epoch": 0.7308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.007882585749030113, "kl": 0.002103184611769393, "learning_rate": 3.654411764705882e-07, "loss": 2.1158022718736902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 482.875, "completions/min_length": 429.0, "epoch": 0.7323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.194072961807251, "kl": 0.002811980899423361, "learning_rate": 3.6617647058823523e-07, "loss": 2.8099555493099615e-05, "reward": 0.8964166641235352, "reward_std": 0.19718943536281586, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 481.625, "completions/min_length": 377.0, "epoch": 0.7338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0321263074874878, "kl": 0.0019173973996657878, "learning_rate": 3.669117647058823e-07, "loss": 1.905113458633423e-05, "reward": 0.7767499685287476, "reward_std": 0.1872129589319229, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.42015671730041504, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 434.625, "completions/min_length": 361.0, "epoch": 0.7352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052766394801437855, "kl": 0.0019682591082528234, "learning_rate": 3.6764705882352943e-07, "loss": 1.9800238078460097e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 439.3125, "completions/min_length": 404.0, "epoch": 0.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051542953588068485, "kl": 0.0020511132606770843, "learning_rate": 3.6838235294117645e-07, "loss": 2.052675154118333e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 440.625, "completions/min_length": 377.0, "epoch": 0.7382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9084182381629944, "kl": 0.002718681702390313, "learning_rate": 3.6911764705882353e-07, "loss": 2.7585774660110474e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 442.75, "completions/min_length": 384.0, "epoch": 0.7397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056256819516420364, "kl": 0.002112714486429468, "learning_rate": 3.698529411764706e-07, "loss": 2.1078274585306644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 448.75, "completions/min_length": 377.0, "epoch": 0.7411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.3851293325424194, "kl": 0.0021162372431717813, "learning_rate": 3.705882352941176e-07, "loss": 2.092287650157232e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 482.9375, "completions/min_length": 373.0, "epoch": 0.7426470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2897626161575317, "kl": 0.0020878326904494315, "learning_rate": 3.713235294117647e-07, "loss": 2.09808349609375e-05, "reward": 0.34037500619888306, "reward_std": 0.25020015239715576, "rewards/DrugCombAccuracyCOTORM/mean": 0.21843749284744263, "rewards/DrugCombAccuracyCOTORM/std": 0.39312729239463806, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.3966001570224762, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 496.0, "completions/min_length": 422.0, "epoch": 0.7441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.419432282447815, "kl": 0.0022437101288232952, "learning_rate": 3.7205882352941177e-07, "loss": 2.2429972887039185e-05, "reward": 0.8112916946411133, "reward_std": 0.3500143587589264, "rewards/DrugCombAccuracyCOTORM/mean": 0.7706249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.413012832403183, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 416.6875, "completions/min_length": 359.0, "epoch": 0.7455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.4238518476486206, "kl": 0.0026825719978660345, "learning_rate": 3.727941176470588e-07, "loss": 2.6650726795196533e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 480.625, "completions/min_length": 380.0, "epoch": 0.7470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0341098308563232, "kl": 0.002671870868653059, "learning_rate": 3.7352941176470586e-07, "loss": 2.645891254360322e-05, "reward": 0.596875011920929, "reward_std": 0.0088388342410326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 478.375, "completions/min_length": 410.0, "epoch": 0.7485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8149634599685669, "kl": 0.0020223186584189534, "learning_rate": 3.7426470588235294e-07, "loss": 2.0347535610198975e-05, "reward": 0.5958333611488342, "reward_std": 0.11082304269075394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 428.75, "completions/min_length": 375.0, "epoch": 0.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.8242294192314148, "kl": 0.0020916581561323255, "learning_rate": 3.75e-07, "loss": 2.0889254301437177e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 487.8125, "completions/min_length": 434.0, "epoch": 0.7514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.027386212721467018, "kl": 0.003023135388502851, "learning_rate": 3.7573529411764703e-07, "loss": 2.9519280360545963e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 467.8125, "completions/min_length": 419.0, "epoch": 0.7529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0788638591766357, "kl": 0.003284448030171916, "learning_rate": 3.764705882352941e-07, "loss": 3.28943133354187e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 448.4375, "completions/min_length": 404.0, "epoch": 0.7544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.004045623820275068, "kl": 0.0019764769240282476, "learning_rate": 3.772058823529412e-07, "loss": 1.9700768461916596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 449.5, "completions/min_length": 394.0, "epoch": 0.7558823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.8480219841003418, "kl": 0.002540223766118288, "learning_rate": 3.779411764705882e-07, "loss": 2.5480985641479492e-05, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.1875, "completions/min_length": 388.0, "epoch": 0.7573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2155903577804565, "kl": 0.0028033012058585882, "learning_rate": 3.7867647058823527e-07, "loss": 2.7626752853393555e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 410.9375, "completions/min_length": 351.0, "epoch": 0.7588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006523520220071077, "kl": 0.0022493031865451485, "learning_rate": 3.7941176470588235e-07, "loss": 2.245254290755838e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 442.25, "completions/min_length": 375.0, "epoch": 0.7602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01084087509661913, "kl": 0.0027691717259585857, "learning_rate": 3.8014705882352937e-07, "loss": 2.7166148356627673e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 483.3125, "completions/min_length": 376.0, "epoch": 0.7617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.941087007522583, "kl": 0.002871873090043664, "learning_rate": 3.8088235294117644e-07, "loss": 2.847087125701364e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 431.75, "completions/min_length": 361.0, "epoch": 0.763235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.06484920531511307, "kl": 0.004525483236648142, "learning_rate": 3.816176470588235e-07, "loss": 4.331835953053087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 487.5625, "completions/min_length": 443.0, "epoch": 0.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.011456206440925598, "kl": 0.0023019181680865586, "learning_rate": 3.8235294117647053e-07, "loss": 2.275323640787974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 454.1875, "completions/min_length": 345.0, "epoch": 0.7661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.786289393901825, "kl": 0.001986821152968332, "learning_rate": 3.8308823529411766e-07, "loss": 1.9937753677368164e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 456.5, "completions/min_length": 425.0, "epoch": 0.7676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.006945445667952299, "kl": 0.002110291999997571, "learning_rate": 3.8382352941176473e-07, "loss": 2.120692442986183e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 456.9375, "completions/min_length": 394.0, "epoch": 0.7691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.5866482257843018, "kl": 0.0025823231553658843, "learning_rate": 3.8455882352941175e-07, "loss": 2.6051577151520178e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 480.5625, "completions/min_length": 421.0, "epoch": 0.7705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0824978351593018, "kl": 0.0022644457349088043, "learning_rate": 3.8529411764705883e-07, "loss": 2.278167630720418e-05, "reward": 0.7609595060348511, "reward_std": 0.17036396265029907, "rewards/DrugCombAccuracyCOTORM/mean": 0.742866039276123, "rewards/DrugCombAccuracyCOTORM/std": 0.37329310178756714, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6776867508888245, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 465.9375, "completions/min_length": 431.0, "epoch": 0.7720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9671568274497986, "kl": 0.002440479351207614, "learning_rate": 3.860294117647059e-07, "loss": 2.4536710043321364e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 457.875, "completions/min_length": 423.0, "epoch": 0.7735294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.5430476665496826, "kl": 0.002261158195324242, "learning_rate": 3.867647058823529e-07, "loss": 2.2280961275100708e-05, "reward": 0.6369999647140503, "reward_std": 0.36935746669769287, "rewards/DrugCombAccuracyCOTORM/mean": 0.5618749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.4581407308578491, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.197202667593956, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 499.4375, "completions/min_length": 434.0, "epoch": 0.775, "frac_reward_zero_std": 0.0, "grad_norm": 1.1893644332885742, "kl": 0.0024484755704179406, "learning_rate": 3.875e-07, "loss": 2.514570951461792e-05, "reward": 0.4204167127609253, "reward_std": 0.07894398272037506, "rewards/DrugCombAccuracyCOTORM/mean": 0.32499998807907104, "rewards/DrugCombAccuracyCOTORM/std": 0.35308167338371277, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 440.625, "completions/min_length": 374.0, "epoch": 0.7764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.7881916761398315, "kl": 0.002165883139241487, "learning_rate": 3.8823529411764707e-07, "loss": 2.1517276763916016e-05, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 522.0, "completions/min_length": 462.0, "epoch": 0.7779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8213452696800232, "kl": 0.001924061041790992, "learning_rate": 3.889705882352941e-07, "loss": 1.9289876945549622e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 434.5, "completions/min_length": 399.0, "epoch": 0.7794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.015252201817929745, "kl": 0.002675430558156222, "learning_rate": 3.8970588235294116e-07, "loss": 2.6852147129829973e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 427.375, "completions/min_length": 376.0, "epoch": 0.7808823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.30982112884521484, "kl": 0.007035367103526369, "learning_rate": 3.9044117647058824e-07, "loss": 7.05936472513713e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 442.5625, "completions/min_length": 389.0, "epoch": 0.7823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3731671571731567, "kl": 0.0016728528134990484, "learning_rate": 3.9117647058823526e-07, "loss": 1.664087176322937e-05, "reward": 0.6499999761581421, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 435.5625, "completions/min_length": 379.0, "epoch": 0.7838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078051090240479, "kl": 0.0017251003300771117, "learning_rate": 3.9191176470588233e-07, "loss": 1.718848943710327e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 384.0, "completions/min_length": 330.0, "epoch": 0.7852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.00685243122279644, "kl": 0.0020272599067538977, "learning_rate": 3.926470588235294e-07, "loss": 2.047326779575087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 407.625, "completions/min_length": 370.0, "epoch": 0.7867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.011522065848112106, "kl": 0.002084890496917069, "learning_rate": 3.933823529411764e-07, "loss": 2.0975538063794374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 426.625, "completions/min_length": 353.0, "epoch": 0.788235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9216789603233337, "kl": 0.0022022232587914914, "learning_rate": 3.941176470588235e-07, "loss": 2.2012918634572998e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 424.9375, "completions/min_length": 376.0, "epoch": 0.7897058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.3346741199493408, "kl": 0.002329252049094066, "learning_rate": 3.9485294117647057e-07, "loss": 2.356618642807007e-05, "reward": 0.6499999761581421, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 457.6875, "completions/min_length": 382.0, "epoch": 0.7911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007300826720893383, "kl": 0.002333331067347899, "learning_rate": 3.955882352941176e-07, "loss": 2.3274682462215424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 416.375, "completions/min_length": 365.0, "epoch": 0.7926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009242065250873566, "kl": 0.002511001017410308, "learning_rate": 3.9632352941176467e-07, "loss": 2.5341621949337423e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 494.125, "completions/min_length": 412.0, "epoch": 0.7941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.5979523658752441, "kl": 0.0033110204385593534, "learning_rate": 3.9705882352941174e-07, "loss": 3.36766242980957e-05, "reward": 0.6558583378791809, "reward_std": 0.3331139385700226, "rewards/DrugCombAccuracyCOTORM/mean": 0.6159167289733887, "rewards/DrugCombAccuracyCOTORM/std": 0.3644828796386719, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6312500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.6491554379463196, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 451.1875, "completions/min_length": 410.0, "epoch": 0.7955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.91175377368927, "kl": 0.00265647895867005, "learning_rate": 3.9779411764705876e-07, "loss": 2.634154589031823e-05, "reward": 0.4000000059604645, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 525.0625, "completions/min_length": 419.0, "epoch": 0.7970588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6436562538146973, "kl": 0.0027108259964734316, "learning_rate": 3.985294117647059e-07, "loss": 2.7105212211608887e-05, "reward": 0.6989375352859497, "reward_std": 0.3259457051753998, "rewards/DrugCombAccuracyCOTORM/mean": 0.6588281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.4228012263774872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.6741765141487122, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 438.6875, "completions/min_length": 365.0, "epoch": 0.7985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1027103662490845, "kl": 0.0026225660694763064, "learning_rate": 3.9926470588235296e-07, "loss": 2.6341527700424194e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 454.875, "completions/min_length": 366.0, "epoch": 0.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.004269069992005825, "kl": 0.0018512785609345883, "learning_rate": 4e-07, "loss": 1.839608739828691e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 458.1875, "completions/min_length": 408.0, "epoch": 0.8014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0917704105377197, "kl": 0.002251379279186949, "learning_rate": 4.0073529411764706e-07, "loss": 2.2310763597488403e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 509.9375, "completions/min_length": 421.0, "epoch": 0.8029411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.5258184671401978, "kl": 0.001648926903726533, "learning_rate": 4.0147058823529413e-07, "loss": 1.6316771507263184e-05, "reward": 0.7134499549865723, "reward_std": 0.29707562923431396, "rewards/DrugCombAccuracyCOTORM/mean": 0.6621249914169312, "rewards/DrugCombAccuracyCOTORM/std": 0.41657811403274536, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8374999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.218708336353302, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 426.125, "completions/min_length": 354.0, "epoch": 0.8044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009477004408836365, "kl": 0.00243170207249932, "learning_rate": 4.0220588235294115e-07, "loss": 2.406436215096619e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 464.1875, "completions/min_length": 368.0, "epoch": 0.8058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 16.75006675720215, "kl": 0.06671016232576221, "learning_rate": 4.029411764705882e-07, "loss": 0.0006484503392130136, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 438.125, "completions/min_length": 363.0, "epoch": 0.8073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03447556495666504, "kl": 0.003638559195678681, "learning_rate": 4.036764705882353e-07, "loss": 3.608471524785273e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 460.1875, "completions/min_length": 397.0, "epoch": 0.8088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2522356510162354, "kl": 0.002373101015109569, "learning_rate": 4.044117647058823e-07, "loss": 2.344630593142938e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.6875, "completions/min_length": 401.0, "epoch": 0.8102941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.144240379333496, "kl": 0.0025380976439919323, "learning_rate": 4.051470588235294e-07, "loss": 2.511133425286971e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 464.25, "completions/min_length": 408.0, "epoch": 0.8117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012748736888170242, "kl": 0.002224120747996494, "learning_rate": 4.0588235294117646e-07, "loss": 2.2386584532796405e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 443.4375, "completions/min_length": 371.0, "epoch": 0.8132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.00429193489253521, "kl": 0.0021495239343494177, "learning_rate": 4.066176470588235e-07, "loss": 2.154629328288138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 521.4375, "completions/min_length": 458.0, "epoch": 0.8147058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5127830505371094, "kl": 0.0019021853513550013, "learning_rate": 4.0735294117647056e-07, "loss": 1.903250813484192e-05, "reward": 0.7833333611488342, "reward_std": 0.25500237941741943, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3247506320476532, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5577734112739563, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 432.6875, "completions/min_length": 384.0, "epoch": 0.8161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9149474501609802, "kl": 0.002134271402610466, "learning_rate": 4.0808823529411763e-07, "loss": 2.11372971534729e-05, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 484.5, "completions/min_length": 431.0, "epoch": 0.8176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.006123663391917944, "kl": 0.0020727775990962982, "learning_rate": 4.0882352941176465e-07, "loss": 2.080112972180359e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 494.875, "completions/min_length": 393.0, "epoch": 0.8191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.007243487983942032, "kl": 0.002055267192190513, "learning_rate": 4.0955882352941173e-07, "loss": 2.0524024876067415e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 464.375, "completions/min_length": 392.0, "epoch": 0.8205882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.4227659702301025, "kl": 0.004178354836767539, "learning_rate": 4.102941176470588e-07, "loss": 4.182755947113037e-05, "reward": 0.637333333492279, "reward_std": 0.29900500178337097, "rewards/DrugCombAccuracyCOTORM/mean": 0.54666668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.43019378185272217, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 433.875, "completions/min_length": 384.0, "epoch": 0.8220588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.005758289713412523, "kl": 0.0018888353370130062, "learning_rate": 4.110294117647058e-07, "loss": 1.8834805814549327e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 481.375, "completions/min_length": 332.0, "epoch": 0.8235294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.7633821964263916, "kl": 0.0021780179522465914, "learning_rate": 4.117647058823529e-07, "loss": 2.199411392211914e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 420.625, "completions/min_length": 355.0, "epoch": 0.825, "frac_reward_zero_std": 0.5, "grad_norm": 1.3222225904464722, "kl": 0.0025232521002180874, "learning_rate": 4.1249999999999997e-07, "loss": 2.529128141759429e-05, "reward": 0.7927083373069763, "reward_std": 0.2482253462076187, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.8003905415534973, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 504.375, "completions/min_length": 408.0, "epoch": 0.8264705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.4105894565582275, "kl": 0.002371774084167555, "learning_rate": 4.1323529411764704e-07, "loss": 2.3573637008666992e-05, "reward": 0.4125000238418579, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 436.8125, "completions/min_length": 343.0, "epoch": 0.8279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9441671371459961, "kl": 0.002422259305603802, "learning_rate": 4.1397058823529406e-07, "loss": 2.3810454877093434e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 464.3125, "completions/min_length": 411.0, "epoch": 0.8294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.3071694374084473, "kl": 0.0031853051914367825, "learning_rate": 4.147058823529412e-07, "loss": 3.218560232198797e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 474.875, "completions/min_length": 419.0, "epoch": 0.8308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0525412559509277, "kl": 0.0026407483383081853, "learning_rate": 4.1544117647058826e-07, "loss": 2.6457310013938695e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 442.8125, "completions/min_length": 386.0, "epoch": 0.8323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2106373310089111, "kl": 0.0018432211654726416, "learning_rate": 4.161764705882353e-07, "loss": 1.8320977687835693e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 449.25, "completions/min_length": 373.0, "epoch": 0.8338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006245697848498821, "kl": 0.0021844840375706553, "learning_rate": 4.1691176470588236e-07, "loss": 2.153668538085185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 404.0, "completions/min_length": 373.0, "epoch": 0.8352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.019765052944421768, "kl": 0.0027229919214732945, "learning_rate": 4.1764705882352943e-07, "loss": 2.7354471967555583e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 485.1875, "completions/min_length": 391.0, "epoch": 0.836764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4372371435165405, "kl": 0.0024778360966593027, "learning_rate": 4.1838235294117645e-07, "loss": 2.4981796741485596e-05, "reward": 0.5625, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 440.75, "completions/min_length": 379.0, "epoch": 0.8382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.6270002126693726, "kl": 0.0018699497450143099, "learning_rate": 4.191176470588235e-07, "loss": 1.882082869997248e-05, "reward": 0.6451666355133057, "reward_std": 0.04926346242427826, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 472.75, "completions/min_length": 429.0, "epoch": 0.8397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006087645888328552, "kl": 0.0018476960540283471, "learning_rate": 4.198529411764706e-07, "loss": 1.8435190213494934e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 458.3125, "completions/min_length": 384.0, "epoch": 0.8411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.005911875981837511, "kl": 0.0023336937883868814, "learning_rate": 4.205882352941176e-07, "loss": 2.3365237211692147e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 435.0625, "completions/min_length": 367.0, "epoch": 0.8426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9904442429542542, "kl": 0.0023594751837663352, "learning_rate": 4.213235294117647e-07, "loss": 2.3603439331054688e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 429.875, "completions/min_length": 385.0, "epoch": 0.8441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.583425760269165, "kl": 0.002096854121191427, "learning_rate": 4.2205882352941177e-07, "loss": 2.09858626476489e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 489.25, "completions/min_length": 388.0, "epoch": 0.8455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9868215322494507, "kl": 0.0020351489074528217, "learning_rate": 4.227941176470588e-07, "loss": 2.047011730610393e-05, "reward": 0.893750011920929, "reward_std": 0.06557891517877579, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.1666666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 483.5625, "completions/min_length": 394.0, "epoch": 0.8470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6483488082885742, "kl": 0.0029150297050364316, "learning_rate": 4.2352941176470586e-07, "loss": 2.9459595680236816e-05, "reward": 0.8374999761581421, "reward_std": 0.3619407117366791, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.875, "completions/min_length": 346.0, "epoch": 0.8485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.3749929666519165, "kl": 0.002487745543476194, "learning_rate": 4.2426470588235293e-07, "loss": 2.459809184074402e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 477.9375, "completions/min_length": 371.0, "epoch": 0.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.006025457754731178, "kl": 0.002143673278624192, "learning_rate": 4.2499999999999995e-07, "loss": 2.147431587218307e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 463.75, "completions/min_length": 379.0, "epoch": 0.8514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9431084990501404, "kl": 0.0019725285237655044, "learning_rate": 4.2573529411764703e-07, "loss": 1.981109380722046e-05, "reward": 0.7124999761581421, "reward_std": 0.2386719137430191, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 495.0, "completions/min_length": 441.0, "epoch": 0.8529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 2.016601324081421, "kl": 0.004275912593584508, "learning_rate": 4.264705882352941e-07, "loss": 4.427507519721985e-05, "reward": 0.773312509059906, "reward_std": 0.3696135878562927, "rewards/DrugCombAccuracyCOTORM/mean": 0.7654687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.42317667603492737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.609375, "rewards/DrugCombCoverageCOTORM/std": 0.8008784651756287, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 476.6875, "completions/min_length": 427.0, "epoch": 0.8544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065437364391982555, "kl": 0.002495497406926006, "learning_rate": 4.272058823529411e-07, "loss": 2.4922121156123467e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 448.9375, "completions/min_length": 401.0, "epoch": 0.8558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.006068414077162743, "kl": 0.0021569394157268107, "learning_rate": 4.279411764705882e-07, "loss": 2.1586623915936798e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 513.0625, "completions/min_length": 431.0, "epoch": 0.8573529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.5114144086837769, "kl": 0.002941273240139708, "learning_rate": 4.2867647058823527e-07, "loss": 2.8789043426513672e-05, "reward": 0.8937499523162842, "reward_std": 0.2281055748462677, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 437.0, "completions/min_length": 389.0, "epoch": 0.8588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.021671045571565628, "kl": 0.002911451563704759, "learning_rate": 4.294117647058823e-07, "loss": 2.937555473181419e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 418.5625, "completions/min_length": 367.0, "epoch": 0.8602941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.2478257417678833, "kl": 0.00305819432833232, "learning_rate": 4.301470588235294e-07, "loss": 3.106147050857544e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 438.75, "completions/min_length": 370.0, "epoch": 0.861764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006260056979954243, "kl": 0.0021775875065941364, "learning_rate": 4.308823529411765e-07, "loss": 2.18165478145238e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 445.875, "completions/min_length": 371.0, "epoch": 0.8632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.006419322453439236, "kl": 0.002054177748505026, "learning_rate": 4.316176470588235e-07, "loss": 2.042217784037348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 423.0625, "completions/min_length": 371.0, "epoch": 0.8647058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.07517389953136444, "kl": 0.003816438344074413, "learning_rate": 4.323529411764706e-07, "loss": 3.81890386051964e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 480.6875, "completions/min_length": 428.0, "epoch": 0.8661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8313159346580505, "kl": 0.0019471953273750842, "learning_rate": 4.3308823529411766e-07, "loss": 1.9229020836064592e-05, "reward": 0.8165000081062317, "reward_std": 0.13981960713863373, "rewards/DrugCombAccuracyCOTORM/mean": 0.8018749952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.2782776653766632, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5055250525474548, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 443.8125, "completions/min_length": 399.0, "epoch": 0.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02372327819466591, "kl": 0.0026610875502228737, "learning_rate": 4.338235294117647e-07, "loss": 2.6729348974185996e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 458.1875, "completions/min_length": 416.0, "epoch": 0.8691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.11769437789917, "kl": 0.0024569847737438977, "learning_rate": 4.3455882352941175e-07, "loss": 2.4755172489676625e-05, "reward": 0.7312500476837158, "reward_std": 0.22350695729255676, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 534.0625, "completions/min_length": 433.0, "epoch": 0.8705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.7707371711730957, "kl": 0.0025787325284909457, "learning_rate": 4.352941176470588e-07, "loss": 2.536177635192871e-05, "reward": 0.5151249766349792, "reward_std": 0.2951925992965698, "rewards/DrugCombAccuracyCOTORM/mean": 0.45249998569488525, "rewards/DrugCombAccuracyCOTORM/std": 0.47584545612335205, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.4876958429813385, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 524.9375, "completions/min_length": 446.0, "epoch": 0.8720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8605828881263733, "kl": 0.0018607291858643293, "learning_rate": 4.3602941176470585e-07, "loss": 1.874603367468808e-05, "reward": 0.9441458582878113, "reward_std": 0.11408175528049469, "rewards/DrugCombAccuracyCOTORM/mean": 0.9321354031562805, "rewards/DrugCombAccuracyCOTORM/std": 0.20061202347278595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 409.3125, "completions/min_length": 353.0, "epoch": 0.8735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011950355023145676, "kl": 0.0023145586892496794, "learning_rate": 4.367647058823529e-07, "loss": 2.3191936634248123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 509.5, "completions/min_length": 415.0, "epoch": 0.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.9361370801925659, "kl": 0.0023773903667461127, "learning_rate": 4.375e-07, "loss": 2.3639855498913676e-05, "reward": 0.9129166603088379, "reward_std": 0.17265836894512177, "rewards/DrugCombAccuracyCOTORM/mean": 0.909375011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.25443974137306213, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 446.0, "completions/min_length": 361.0, "epoch": 0.8764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007705550640821457, "kl": 0.0021418477699626237, "learning_rate": 4.38235294117647e-07, "loss": 2.135221438948065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 489.0625, "completions/min_length": 441.0, "epoch": 0.8779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010011746548116207, "kl": 0.0025781506265047938, "learning_rate": 4.389705882352941e-07, "loss": 2.5762694349396043e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 390.0, "completions/min_length": 347.0, "epoch": 0.8794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.007818564772605896, "kl": 0.0023120242985896766, "learning_rate": 4.3970588235294116e-07, "loss": 2.3157572286436334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 432.0625, "completions/min_length": 368.0, "epoch": 0.8808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9118161201477051, "kl": 0.0025384192122146487, "learning_rate": 4.404411764705882e-07, "loss": 2.5834688130998984e-05, "reward": 0.585812509059906, "reward_std": 0.17963135242462158, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.234375, "rewards/DrugCombCoverageCOTORM/std": 0.9893969297409058, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 452.6875, "completions/min_length": 388.0, "epoch": 0.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01916041038930416, "kl": 0.0029126506415195763, "learning_rate": 4.4117647058823526e-07, "loss": 2.7711375878425315e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 446.9375, "completions/min_length": 391.0, "epoch": 0.8838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8267856240272522, "kl": 0.0021977972355671227, "learning_rate": 4.4191176470588233e-07, "loss": 2.2097176042734645e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 424.0, "completions/min_length": 385.0, "epoch": 0.8852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.004928530193865299, "kl": 0.002343770960578695, "learning_rate": 4.4264705882352935e-07, "loss": 2.3657034034840763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 444.8125, "completions/min_length": 388.0, "epoch": 0.8867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1739646196365356, "kl": 0.002608105569379404, "learning_rate": 4.433823529411764e-07, "loss": 2.564435635576956e-05, "reward": 0.887499988079071, "reward_std": 0.12026755511760712, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 409.0625, "completions/min_length": 344.0, "epoch": 0.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00882669910788536, "kl": 0.0023243827745318413, "learning_rate": 4.441176470588235e-07, "loss": 2.314654011570383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 447.875, "completions/min_length": 394.0, "epoch": 0.8897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.022288840264081955, "kl": 0.0027907488984055817, "learning_rate": 4.448529411764705e-07, "loss": 2.8101190764573403e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 473.125, "completions/min_length": 426.0, "epoch": 0.8911764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.599250316619873, "kl": 0.003195220197085291, "learning_rate": 4.4558823529411764e-07, "loss": 3.215298056602478e-05, "reward": 0.7746666669845581, "reward_std": 0.35917526483535767, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.42015671730041504, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907120168209076, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 437.25, "completions/min_length": 387.0, "epoch": 0.8926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.006941945757716894, "kl": 0.0018000253185164183, "learning_rate": 4.463235294117647e-07, "loss": 1.7841339285951108e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 489.75, "completions/min_length": 440.0, "epoch": 0.8941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714224696159363, "kl": 0.0022446077200584114, "learning_rate": 4.470588235294118e-07, "loss": 2.2305523089016788e-05, "reward": 0.75, "reward_std": 0.030860671773552895, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.6666666865348816, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 399.5625, "completions/min_length": 332.0, "epoch": 0.8955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.005036017391830683, "kl": 0.0017927037551999092, "learning_rate": 4.477941176470588e-07, "loss": 1.7903712432598695e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 476.875, "completions/min_length": 373.0, "epoch": 0.8970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0261095762252808, "kl": 0.0027447067841421813, "learning_rate": 4.485294117647059e-07, "loss": 2.7000904083251953e-05, "reward": 0.644058346748352, "reward_std": 0.008650270290672779, "rewards/DrugCombAccuracyCOTORM/mean": 0.5850208401679993, "rewards/DrugCombAccuracyCOTORM/std": 0.4286993443965912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 406.75, "completions/min_length": 358.0, "epoch": 0.8985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.005585739854723215, "kl": 0.002063571155304089, "learning_rate": 4.4926470588235296e-07, "loss": 2.060295082628727e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 409.9375, "completions/min_length": 359.0, "epoch": 0.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.012900418601930141, "kl": 0.0024883788428269327, "learning_rate": 4.5e-07, "loss": 2.4775306883384474e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 470.6875, "completions/min_length": 376.0, "epoch": 0.9014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9746097326278687, "kl": 0.0023357945028692484, "learning_rate": 4.5073529411764705e-07, "loss": 2.333366865059361e-05, "reward": 0.36158332228660583, "reward_std": 0.19466829299926758, "rewards/DrugCombAccuracyCOTORM/mean": 0.3400000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4652741253376007, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1041666567325592, "rewards/DrugCombCoverageCOTORM/std": 0.9562174677848816, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 420.3125, "completions/min_length": 365.0, "epoch": 0.9029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007198035251349211, "kl": 0.002324432280147448, "learning_rate": 4.5147058823529413e-07, "loss": 2.3101783881429583e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 476.8125, "completions/min_length": 425.0, "epoch": 0.9044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9448323845863342, "kl": 0.0026190813805442303, "learning_rate": 4.5220588235294115e-07, "loss": 2.5787530830712058e-05, "reward": 0.6784166693687439, "reward_std": 0.20055413246154785, "rewards/DrugCombAccuracyCOTORM/mean": 0.6318750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.4915176331996918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4425306022167206, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 404.75, "completions/min_length": 353.0, "epoch": 0.9058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.2650022506713867, "kl": 0.0053657652460969985, "learning_rate": 4.529411764705882e-07, "loss": 5.3150684834690765e-05, "reward": 0.7281666994094849, "reward_std": 0.10523491352796555, "rewards/DrugCombAccuracyCOTORM/mean": 0.6758333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.3739350438117981, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 442.75, "completions/min_length": 397.0, "epoch": 0.9073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1132863759994507, "kl": 0.002314081008080393, "learning_rate": 4.536764705882353e-07, "loss": 2.3014843463897705e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 455.125, "completions/min_length": 366.0, "epoch": 0.9088235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.566605806350708, "kl": 0.0025587190757505596, "learning_rate": 4.544117647058823e-07, "loss": 2.5641173124313354e-05, "reward": 0.5687500238418579, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 477.0, "completions/min_length": 424.0, "epoch": 0.9102941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.6921964883804321, "kl": 0.0030837689992040396, "learning_rate": 4.551470588235294e-07, "loss": 3.120303153991699e-05, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 437.5, "completions/min_length": 356.0, "epoch": 0.9117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.085262417793274, "kl": 0.0026341721531935036, "learning_rate": 4.5588235294117646e-07, "loss": 2.60770320892334e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 409.9375, "completions/min_length": 353.0, "epoch": 0.913235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006432591937482357, "kl": 0.002200163376983255, "learning_rate": 4.566176470588235e-07, "loss": 2.1907948394073173e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 455.125, "completions/min_length": 400.0, "epoch": 0.9147058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.462999701499939, "kl": 0.0031943831709213555, "learning_rate": 4.5735294117647056e-07, "loss": 3.2238662242889404e-05, "reward": 0.687416672706604, "reward_std": 0.14292733371257782, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3890872597694397, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.3125, "completions/min_length": 389.0, "epoch": 0.9161764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8835386037826538, "kl": 0.002669125300599262, "learning_rate": 4.5808823529411763e-07, "loss": 2.6494264602661133e-05, "reward": 0.824999988079071, "reward_std": 0.37287637591362, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 495.5625, "completions/min_length": 414.0, "epoch": 0.9176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.4753196239471436, "kl": 0.016746839217375964, "learning_rate": 4.5882352941176465e-07, "loss": 0.00017457455396652222, "reward": 0.35975000262260437, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.27000001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.3698107898235321, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 452.625, "completions/min_length": 371.0, "epoch": 0.9191176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.3735750913619995, "kl": 0.002532153041101992, "learning_rate": 4.595588235294117e-07, "loss": 2.5054016077774577e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 442.5625, "completions/min_length": 354.0, "epoch": 0.9205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0987967252731323, "kl": 0.004287501506041735, "learning_rate": 4.602941176470588e-07, "loss": 4.2341111111454666e-05, "reward": 0.4437499940395355, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 425.1875, "completions/min_length": 378.0, "epoch": 0.9220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.006879126653075218, "kl": 0.0023714108974672854, "learning_rate": 4.6102941176470587e-07, "loss": 2.366257285757456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 433.4375, "completions/min_length": 372.0, "epoch": 0.9235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010828005149960518, "kl": 0.0026608591433614492, "learning_rate": 4.6176470588235295e-07, "loss": 2.6628029445419088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 430.25, "completions/min_length": 373.0, "epoch": 0.925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006645023822784424, "kl": 0.002358889440074563, "learning_rate": 4.625e-07, "loss": 2.3465778212994337e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 427.625, "completions/min_length": 399.0, "epoch": 0.9264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.376057744026184, "kl": 0.0027171659749001265, "learning_rate": 4.6323529411764704e-07, "loss": 2.7155505449627526e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 448.9375, "completions/min_length": 415.0, "epoch": 0.9279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9351078867912292, "kl": 0.002137232542736456, "learning_rate": 4.639705882352941e-07, "loss": 2.1607025701086968e-05, "reward": 0.656166672706604, "reward_std": 0.04289780929684639, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 469.1875, "completions/min_length": 398.0, "epoch": 0.9294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009668312966823578, "kl": 0.003100784495472908, "learning_rate": 4.647058823529412e-07, "loss": 3.092344923061319e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 456.4375, "completions/min_length": 386.0, "epoch": 0.9308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.008549903519451618, "kl": 0.0022007972875144333, "learning_rate": 4.654411764705882e-07, "loss": 2.1991534595144913e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 432.375, "completions/min_length": 332.0, "epoch": 0.9323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9498463869094849, "kl": 0.0018823330465238541, "learning_rate": 4.661764705882353e-07, "loss": 1.884318407974206e-05, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 449.25, "completions/min_length": 383.0, "epoch": 0.9338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3196178674697876, "kl": 0.0032065796258393675, "learning_rate": 4.6691176470588235e-07, "loss": 3.2065072446130216e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 466.4375, "completions/min_length": 401.0, "epoch": 0.9352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.00577698927372694, "kl": 0.0019268043688498437, "learning_rate": 4.676470588235294e-07, "loss": 1.9264034563093446e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 449.875, "completions/min_length": 379.0, "epoch": 0.9367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1483157873153687, "kl": 0.0027986086497548968, "learning_rate": 4.6838235294117645e-07, "loss": 2.778677298920229e-05, "reward": 0.6937500238418579, "reward_std": 0.24266010522842407, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 508.3125, "completions/min_length": 420.0, "epoch": 0.9382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011148719117045403, "kl": 0.0022427419607993215, "learning_rate": 4.691176470588235e-07, "loss": 2.2463136701844633e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 520.5, "completions/min_length": 428.0, "epoch": 0.9397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006693145260214806, "kl": 0.0024876218230929226, "learning_rate": 4.6985294117647054e-07, "loss": 2.4864224542398006e-05, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.12909944355487823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 459.5, "completions/min_length": 420.0, "epoch": 0.9411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.003208799520507455, "kl": 0.00150888369535096, "learning_rate": 4.705882352941176e-07, "loss": 1.51195636135526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 447.375, "completions/min_length": 398.0, "epoch": 0.9426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0614728927612305, "kl": 0.0027891735080629587, "learning_rate": 4.713235294117647e-07, "loss": 2.8109916456742212e-05, "reward": 0.7734375, "reward_std": 0.2422400414943695, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 488.875, "completions/min_length": 411.0, "epoch": 0.9441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9791005849838257, "kl": 0.002228934521554038, "learning_rate": 4.720588235294117e-07, "loss": 2.2366642951965332e-05, "reward": 0.4741874933242798, "reward_std": 0.1739361435174942, "rewards/DrugCombAccuracyCOTORM/mean": 0.34859374165534973, "rewards/DrugCombAccuracyCOTORM/std": 0.45981335639953613, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 455.1875, "completions/min_length": 350.0, "epoch": 0.9455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9045499563217163, "kl": 0.0018997366423718631, "learning_rate": 4.727941176470588e-07, "loss": 1.8979719243361615e-05, "reward": 0.9760416746139526, "reward_std": 0.05553445965051651, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 445.125, "completions/min_length": 338.0, "epoch": 0.9470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.184585452079773, "kl": 0.0029068634030409157, "learning_rate": 4.7352941176470586e-07, "loss": 2.9163667932152748e-05, "reward": 0.5977500081062317, "reward_std": 0.17456947267055511, "rewards/DrugCombAccuracyCOTORM/mean": 0.5831249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.49084240198135376, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9287087917327881, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 457.0, "completions/min_length": 411.0, "epoch": 0.9485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.281004548072815, "kl": 0.0023159922566264868, "learning_rate": 4.742647058823529e-07, "loss": 2.298973049619235e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.0, "completions/min_length": 401.0, "epoch": 0.95, "frac_reward_zero_std": 0.5, "grad_norm": 0.8901208639144897, "kl": 0.0020539654651656747, "learning_rate": 4.7499999999999995e-07, "loss": 2.058970130747184e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 425.8125, "completions/min_length": 359.0, "epoch": 0.9514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9761160016059875, "kl": 0.0021333052427507937, "learning_rate": 4.75735294117647e-07, "loss": 2.114175003953278e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 447.0, "completions/min_length": 380.0, "epoch": 0.9529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0158370733261108, "kl": 0.002537250053137541, "learning_rate": 4.7647058823529405e-07, "loss": 2.5406479835510254e-05, "reward": 0.512499988079071, "reward_std": 0.02314550243318081, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 476.5, "completions/min_length": 432.0, "epoch": 0.9544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9614681005477905, "kl": 0.0026682109164539725, "learning_rate": 4.772058823529412e-07, "loss": 2.648681402206421e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 420.6875, "completions/min_length": 348.0, "epoch": 0.9558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02042466588318348, "kl": 0.002866337134037167, "learning_rate": 4.779411764705882e-07, "loss": 2.9243157769087702e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 480.75, "completions/min_length": 386.0, "epoch": 0.9573529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 2.6109869480133057, "kl": 0.0025751354114618152, "learning_rate": 4.786764705882353e-07, "loss": 2.6203691959381104e-05, "reward": 0.6312500238418579, "reward_std": 0.4467061161994934, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 475.1875, "completions/min_length": 399.0, "epoch": 0.9588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6319403648376465, "kl": 0.0024840192636474967, "learning_rate": 4.794117647058823e-07, "loss": 2.5160610675811768e-05, "reward": 0.6089166402816772, "reward_std": 0.2576225996017456, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 467.6875, "completions/min_length": 363.0, "epoch": 0.9602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01176303531974554, "kl": 0.0025475869479123503, "learning_rate": 4.801470588235294e-07, "loss": 2.5501609343336895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 412.4375, "completions/min_length": 381.0, "epoch": 0.961764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1625800132751465, "kl": 0.0027940458967350423, "learning_rate": 4.808823529411765e-07, "loss": 2.7827918529510498e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 427.0625, "completions/min_length": 371.0, "epoch": 0.9632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.005792639683932066, "kl": 0.0020794083829969168, "learning_rate": 4.816176470588235e-07, "loss": 2.0924271666444838e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 525.125, "completions/min_length": 443.0, "epoch": 0.9647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.411132574081421, "kl": 0.002450312487781048, "learning_rate": 4.823529411764705e-07, "loss": 2.442300319671631e-05, "reward": 0.5880833268165588, "reward_std": 0.4611999988555908, "rewards/DrugCombAccuracyCOTORM/mean": 0.5554167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.48696625232696533, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8668268918991089, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 440.375, "completions/min_length": 401.0, "epoch": 0.9661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0837570428848267, "kl": 0.00276977097382769, "learning_rate": 4.830882352941177e-07, "loss": 2.7777739887824282e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 423.75, "completions/min_length": 374.0, "epoch": 0.9676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.006050726864486933, "kl": 0.001885165082057938, "learning_rate": 4.838235294117647e-07, "loss": 1.873756446002517e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 494.4375, "completions/min_length": 437.0, "epoch": 0.9691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.014929175376892, "kl": 0.0023178886040113866, "learning_rate": 4.845588235294117e-07, "loss": 2.3230910301208496e-05, "reward": 0.7302261590957642, "reward_std": 0.19711560010910034, "rewards/DrugCombAccuracyCOTORM/mean": 0.6705952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.4716663956642151, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 502.9375, "completions/min_length": 437.0, "epoch": 0.9705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.347926378250122, "kl": 0.0023236187116708606, "learning_rate": 4.852941176470588e-07, "loss": 2.3290514945983887e-05, "reward": 0.637499988079071, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 407.3125, "completions/min_length": 313.0, "epoch": 0.9720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2927937507629395, "kl": 0.0025532175204716623, "learning_rate": 4.860294117647058e-07, "loss": 2.5289435143349692e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 453.5625, "completions/min_length": 393.0, "epoch": 0.9735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1992555856704712, "kl": 0.0031145060202106833, "learning_rate": 4.867647058823529e-07, "loss": 3.111796468147077e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 485.3125, "completions/min_length": 380.0, "epoch": 0.975, "frac_reward_zero_std": 0.5, "grad_norm": 0.8451515436172485, "kl": 0.001985476032132283, "learning_rate": 4.875e-07, "loss": 2.0129893528064713e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 434.5625, "completions/min_length": 375.0, "epoch": 0.9764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.07922230660915375, "kl": 0.003002183453645557, "learning_rate": 4.88235294117647e-07, "loss": 2.9904387702117674e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 489.3125, "completions/min_length": 397.0, "epoch": 0.9779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.876001238822937, "kl": 0.0023846016847528517, "learning_rate": 4.88970588235294e-07, "loss": 2.3908913135528564e-05, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 416.25, "completions/min_length": 355.0, "epoch": 0.9794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.4120051860809326, "kl": 0.0028047088999301195, "learning_rate": 4.897058823529412e-07, "loss": 2.7924776077270508e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 474.6875, "completions/min_length": 408.0, "epoch": 0.9808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.005994720850139856, "kl": 0.0020394358434714377, "learning_rate": 4.904411764705882e-07, "loss": 2.0476578356465325e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 468.125, "completions/min_length": 394.0, "epoch": 0.9823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.5689152479171753, "kl": 0.0031506537343375385, "learning_rate": 4.911764705882352e-07, "loss": 3.139674663543701e-05, "reward": 0.21250000596046448, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 444.75, "completions/min_length": 353.0, "epoch": 0.9838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9509246349334717, "kl": 0.002961227495688945, "learning_rate": 4.919117647058823e-07, "loss": 2.987148764077574e-05, "reward": 0.7484375238418579, "reward_std": 0.20835641026496887, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 480.25, "completions/min_length": 408.0, "epoch": 0.9852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9906057715415955, "kl": 0.003025366138899699, "learning_rate": 4.926470588235295e-07, "loss": 3.026060039701406e-05, "reward": 0.9051250219345093, "reward_std": 0.17601576447486877, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 462.5, "completions/min_length": 409.0, "epoch": 0.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9379611611366272, "kl": 0.002330527000594884, "learning_rate": 4.933823529411765e-07, "loss": 2.327561378479004e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 486.0, "completions/min_length": 421.0, "epoch": 0.9882352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0026545524597168, "kl": 0.002427144325338304, "learning_rate": 4.941176470588235e-07, "loss": 2.397596836090088e-05, "reward": 0.8040000200271606, "reward_std": 0.2114352285861969, "rewards/DrugCombAccuracyCOTORM/mean": 0.7706249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4125242531299591, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.28867512941360474, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.0, "completions/min_length": 383.0, "epoch": 0.9897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009380031377077103, "kl": 0.0024231366987805814, "learning_rate": 4.948529411764706e-07, "loss": 2.4006787498365156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 460.875, "completions/min_length": 378.0, "epoch": 0.9911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0990961790084839, "kl": 0.002302756387507543, "learning_rate": 4.955882352941176e-07, "loss": 2.2775322577217594e-05, "reward": 0.9239000082015991, "reward_std": 0.1409098207950592, "rewards/DrugCombAccuracyCOTORM/mean": 0.9079999923706055, "rewards/DrugCombAccuracyCOTORM/std": 0.25139185786247253, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 452.5625, "completions/min_length": 421.0, "epoch": 0.9926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8575478196144104, "kl": 0.002302175882505253, "learning_rate": 4.963235294117647e-07, "loss": 2.2956255634198897e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 484.5625, "completions/min_length": 423.0, "epoch": 0.9941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3955976963043213, "kl": 0.0022324625751934946, "learning_rate": 4.970588235294118e-07, "loss": 2.2172927856445312e-05, "reward": 0.6000000238418579, "reward_std": 0.43587726354599, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 387.5, "completions/min_length": 297.0, "epoch": 0.9955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.007551355287432671, "kl": 0.002364010608289391, "learning_rate": 4.977941176470588e-07, "loss": 2.3542879716842435e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 458.75, "completions/min_length": 366.0, "epoch": 0.9970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8880283832550049, "kl": 0.0030504525639116764, "learning_rate": 4.985294117647058e-07, "loss": 3.0228973628254607e-05, "reward": 0.6312500238418579, "reward_std": 0.1510380655527115, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 473.0625, "completions/min_length": 389.0, "epoch": 0.9985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9544476866722107, "kl": 0.002350116556044668, "learning_rate": 4.99264705882353e-07, "loss": 2.3363620130112395e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 479.75, "completions/min_length": 445.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.7407342195510864, "kl": 0.0034042687038891017, "learning_rate": 5e-07, "loss": 3.456324338912964e-05, "reward": 0.84375, "reward_std": 0.3102847933769226, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.35939764976501465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 465.3125, "completions/min_length": 403.0, "epoch": 1.0014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.006062894593924284, "kl": 0.0018183780484832823, "learning_rate": 5.00735294117647e-07, "loss": 1.820509169192519e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 468.8125, "completions/min_length": 397.0, "epoch": 1.0029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0301300287246704, "kl": 0.0033943868475034833, "learning_rate": 5.014705882352941e-07, "loss": 3.436917904764414e-05, "reward": 0.762499988079071, "reward_std": 0.25460052490234375, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 422.0625, "completions/min_length": 373.0, "epoch": 1.0044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.007209365721791983, "kl": 0.0022011962719261646, "learning_rate": 5.022058823529411e-07, "loss": 2.232191945950035e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.5625, "completions/min_length": 398.0, "epoch": 1.0058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.04564662650227547, "kl": 0.00279955001315102, "learning_rate": 5.029411764705882e-07, "loss": 2.780569593596738e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 477.25, "completions/min_length": 385.0, "epoch": 1.0073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0445261001586914, "kl": 0.002668924367753789, "learning_rate": 5.036764705882353e-07, "loss": 2.625887282192707e-05, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.125, "completions/min_length": 402.0, "epoch": 1.0088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059999157674610615, "kl": 0.0019286128517705947, "learning_rate": 5.044117647058823e-07, "loss": 1.9322033040225506e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 475.0, "completions/min_length": 394.0, "epoch": 1.0102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.00768984854221344, "kl": 0.0022438454325310886, "learning_rate": 5.051470588235293e-07, "loss": 2.2436914150603116e-05, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 491.5, "completions/min_length": 425.0, "epoch": 1.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1019829511642456, "kl": 0.0026574507646728307, "learning_rate": 5.058823529411765e-07, "loss": 2.6428331693750806e-05, "reward": 0.3229166865348816, "reward_std": 0.23127280175685883, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.57373046875, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 458.3125, "completions/min_length": 431.0, "epoch": 1.013235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009925564751029015, "kl": 0.001999460335355252, "learning_rate": 5.066176470588235e-07, "loss": 1.9856825019815005e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 458.4375, "completions/min_length": 388.0, "epoch": 1.0147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.007469399366527796, "kl": 0.0025247559533454478, "learning_rate": 5.073529411764705e-07, "loss": 2.5375265977345407e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 459.625, "completions/min_length": 374.0, "epoch": 1.0161764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.1130353212356567, "kl": 0.0023992031638044864, "learning_rate": 5.080882352941176e-07, "loss": 2.3921020328998566e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 531.5625, "completions/min_length": 460.0, "epoch": 1.0176470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8774237632751465, "kl": 0.0022728071780875325, "learning_rate": 5.088235294117646e-07, "loss": 2.2932887077331543e-05, "reward": 0.4776666760444641, "reward_std": 0.17047721147537231, "rewards/DrugCombAccuracyCOTORM/mean": 0.35750001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.45492124557495117, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2277100384235382, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 467.0, "completions/min_length": 428.0, "epoch": 1.0191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.09195780009031296, "kl": 0.0031801176664885134, "learning_rate": 5.095588235294117e-07, "loss": 3.124789145658724e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 456.5, "completions/min_length": 418.0, "epoch": 1.0205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8746676445007324, "kl": 0.0024188524985220283, "learning_rate": 5.102941176470588e-07, "loss": 2.4137476430041716e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 505.3125, "completions/min_length": 428.0, "epoch": 1.0220588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3883793354034424, "kl": 0.0025866506330203265, "learning_rate": 5.110294117647058e-07, "loss": 2.5585293769836426e-05, "reward": 0.7250000238418579, "reward_std": 0.38195645809173584, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 445.0625, "completions/min_length": 354.0, "epoch": 1.0235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.016782157123088837, "kl": 0.0035496445489116013, "learning_rate": 5.117647058823528e-07, "loss": 3.4596472687553614e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 454.9375, "completions/min_length": 382.0, "epoch": 1.025, "frac_reward_zero_std": 0.5, "grad_norm": 0.9693678021430969, "kl": 0.002084026869852096, "learning_rate": 5.125e-07, "loss": 2.0526349544525146e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 404.25, "completions/min_length": 351.0, "epoch": 1.026470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.4468799829483032, "kl": 0.0028551622526720166, "learning_rate": 5.13235294117647e-07, "loss": 2.8354363166727126e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 559.75, "completions/min_length": 448.0, "epoch": 1.0279411764705881, "frac_reward_zero_std": 0.5, "grad_norm": 0.76604825258255, "kl": 0.0018290492007508874, "learning_rate": 5.13970588235294e-07, "loss": 1.8222986909677275e-05, "reward": 0.7497314214706421, "reward_std": 0.022591233253479004, "rewards/DrugCombAccuracyCOTORM/mean": 0.695844829082489, "rewards/DrugCombAccuracyCOTORM/std": 0.31736138463020325, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9305555820465088, "rewards/DrugCombCoverageCOTORM/std": 0.10638079047203064, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 454.8125, "completions/min_length": 379.0, "epoch": 1.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.006000600289553404, "kl": 0.0015707760176155716, "learning_rate": 5.147058823529411e-07, "loss": 1.5568090020678937e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 502.875, "completions/min_length": 427.0, "epoch": 1.0308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0267618894577026, "kl": 0.002470849722158164, "learning_rate": 5.154411764705882e-07, "loss": 2.4218354155891575e-05, "reward": 0.6596418619155884, "reward_std": 0.015800384804606438, "rewards/DrugCombAccuracyCOTORM/mean": 0.5999584794044495, "rewards/DrugCombAccuracyCOTORM/std": 0.41364601254463196, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7967510223388672, "rewards/DrugCombCoverageCOTORM/std": 0.2175408899784088, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 471.4375, "completions/min_length": 422.0, "epoch": 1.0323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9235272407531738, "kl": 0.0022338945709634572, "learning_rate": 5.161764705882353e-07, "loss": 2.2437601728597656e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 407.25, "completions/min_length": 366.0, "epoch": 1.0338235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.03357109799981117, "kl": 0.003429403412155807, "learning_rate": 5.169117647058824e-07, "loss": 3.4753484214888886e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 412.9375, "completions/min_length": 381.0, "epoch": 1.035294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.007240059785544872, "kl": 0.0022018830059096217, "learning_rate": 5.176470588235294e-07, "loss": 2.1848061805940233e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 451.1875, "completions/min_length": 384.0, "epoch": 1.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0306950807571411, "kl": 0.002552758698584512, "learning_rate": 5.183823529411764e-07, "loss": 2.56854673352791e-05, "reward": 0.5854166746139526, "reward_std": 0.02260337956249714, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 494.6875, "completions/min_length": 459.0, "epoch": 1.0382352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.5101537704467773, "kl": 0.002430580963846296, "learning_rate": 5.191176470588236e-07, "loss": 2.4285167455673218e-05, "reward": 0.6678333282470703, "reward_std": 0.3591628074645996, "rewards/DrugCombAccuracyCOTORM/mean": 0.5900000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4849192500114441, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.0, "completions/min_length": 388.0, "epoch": 1.0397058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.2065688371658325, "kl": 0.0024369104066863656, "learning_rate": 5.198529411764706e-07, "loss": 2.4432392820017412e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 458.9375, "completions/min_length": 413.0, "epoch": 1.0411764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0154454708099365, "kl": 0.002347619942156598, "learning_rate": 5.205882352941176e-07, "loss": 2.3084738131728955e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.5, "completions/min_length": 411.0, "epoch": 1.0426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017226481810212135, "kl": 0.002766235440503806, "learning_rate": 5.213235294117647e-07, "loss": 2.7692778530763462e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 431.8125, "completions/min_length": 382.0, "epoch": 1.0441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8584550619125366, "kl": 0.0021805143915116787, "learning_rate": 5.220588235294118e-07, "loss": 2.193743785028346e-05, "reward": 0.9775428771972656, "reward_std": 0.06351838260889053, "rewards/DrugCombAccuracyCOTORM/mean": 0.971928596496582, "rewards/DrugCombAccuracyCOTORM/std": 0.11228571832180023, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 424.0625, "completions/min_length": 315.0, "epoch": 1.0455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1815463304519653, "kl": 0.0021507905330508947, "learning_rate": 5.227941176470588e-07, "loss": 2.1620340703520924e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 452.5625, "completions/min_length": 408.0, "epoch": 1.0470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00854573491960764, "kl": 0.0024557280994486064, "learning_rate": 5.235294117647059e-07, "loss": 2.454844616295304e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 519.375, "completions/min_length": 466.0, "epoch": 1.048529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.046174168586731, "kl": 0.0022382422466762364, "learning_rate": 5.242647058823529e-07, "loss": 2.2359192371368408e-05, "reward": 0.7389583587646484, "reward_std": 0.10547676682472229, "rewards/DrugCombAccuracyCOTORM/mean": 0.682812511920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3714519739151001, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 434.625, "completions/min_length": 354.0, "epoch": 1.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.015377702191472054, "kl": 0.002642574836499989, "learning_rate": 5.25e-07, "loss": 2.6094472559634596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 445.0625, "completions/min_length": 394.0, "epoch": 1.0514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.2853344678878784, "kl": 0.0030251205316744745, "learning_rate": 5.257352941176471e-07, "loss": 3.0246779715525918e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 448.0, "completions/min_length": 334.0, "epoch": 1.0529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.107920527458191, "kl": 0.0022339776332955807, "learning_rate": 5.264705882352941e-07, "loss": 2.2396445274353027e-05, "reward": 0.25, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 507.3125, "completions/min_length": 395.0, "epoch": 1.0544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8923659324645996, "kl": 0.002358848665608093, "learning_rate": 5.272058823529411e-07, "loss": 2.337247133255005e-05, "reward": 0.9476562738418579, "reward_std": 0.07224123179912567, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9765625, "rewards/DrugCombCoverageCOTORM/std": 0.050389111042022705, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 472.0625, "completions/min_length": 414.0, "epoch": 1.0558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.893306314945221, "kl": 0.0022604191035497934, "learning_rate": 5.279411764705882e-07, "loss": 2.2468180759460665e-05, "reward": 0.7749999761581421, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 465.125, "completions/min_length": 371.0, "epoch": 1.0573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.004965552594512701, "kl": 0.0019508787372615188, "learning_rate": 5.286764705882353e-07, "loss": 1.9298695406178012e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 465.6875, "completions/min_length": 424.0, "epoch": 1.0588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0947589874267578, "kl": 0.0021816669905092567, "learning_rate": 5.294117647058823e-07, "loss": 2.1929607100901194e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 506.0, "completions/min_length": 455.0, "epoch": 1.0602941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5414601564407349, "kl": 0.002204998629167676, "learning_rate": 5.301470588235294e-07, "loss": 2.1532177925109863e-05, "reward": 0.8031041622161865, "reward_std": 0.10737232863903046, "rewards/DrugCombAccuracyCOTORM/mean": 0.7629948258399963, "rewards/DrugCombAccuracyCOTORM/std": 0.19823989272117615, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 459.9375, "completions/min_length": 431.0, "epoch": 1.061764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.005909634754061699, "kl": 0.002167692466173321, "learning_rate": 5.308823529411764e-07, "loss": 2.165001023968216e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 467.75, "completions/min_length": 420.0, "epoch": 1.063235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1770429611206055, "kl": 0.0022392409155145288, "learning_rate": 5.316176470588235e-07, "loss": 2.2426247596740723e-05, "reward": 0.7416666746139526, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3191423714160919, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 497.3125, "completions/min_length": 389.0, "epoch": 1.0647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 3.7230353355407715, "kl": 0.04250630983733572, "learning_rate": 5.323529411764706e-07, "loss": 0.0004436598683241755, "reward": 0.8083125352859497, "reward_std": 0.18037275969982147, "rewards/DrugCombAccuracyCOTORM/mean": 0.7838281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.3460347354412079, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.33592739701271057, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 444.75, "completions/min_length": 415.0, "epoch": 1.0661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.7261974811553955, "kl": 0.0026523952838033438, "learning_rate": 5.330882352941176e-07, "loss": 2.6103109121322632e-05, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 453.0, "completions/min_length": 383.0, "epoch": 1.0676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2546682357788086, "kl": 0.003160833672154695, "learning_rate": 5.338235294117646e-07, "loss": 3.1628234864911065e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 414.0625, "completions/min_length": 379.0, "epoch": 1.0691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.006123048719018698, "kl": 0.002421122189844027, "learning_rate": 5.345588235294117e-07, "loss": 2.4171897166525014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 479.5, "completions/min_length": 399.0, "epoch": 1.0705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.4834522008895874, "kl": 0.00300450247596018, "learning_rate": 5.352941176470588e-07, "loss": 2.9549002647399902e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 482.4375, "completions/min_length": 433.0, "epoch": 1.0720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9519833922386169, "kl": 0.0030591603717766702, "learning_rate": 5.360294117647058e-07, "loss": 3.0800700187683105e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 509.0, "completions/min_length": 405.0, "epoch": 1.0735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.858618974685669, "kl": 0.0021323018590919673, "learning_rate": 5.367647058823529e-07, "loss": 2.159768519049976e-05, "reward": 0.7781606912612915, "reward_std": 0.061770834028720856, "rewards/DrugCombAccuracyCOTORM/mean": 0.7266071438789368, "rewards/DrugCombAccuracyCOTORM/std": 0.2964767515659332, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 447.8125, "completions/min_length": 387.0, "epoch": 1.075, "frac_reward_zero_std": 0.5, "grad_norm": 1.3282861709594727, "kl": 0.0029163926956243813, "learning_rate": 5.374999999999999e-07, "loss": 2.903974564105738e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 419.25, "completions/min_length": 379.0, "epoch": 1.076470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.4470655918121338, "kl": 0.002551898709498346, "learning_rate": 5.382352941176471e-07, "loss": 2.5351995645905845e-05, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 439.625, "completions/min_length": 383.0, "epoch": 1.0779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.006048028822988272, "kl": 0.0021640284394379705, "learning_rate": 5.389705882352941e-07, "loss": 2.1633510186802596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 456.5625, "completions/min_length": 343.0, "epoch": 1.0794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.004950343165546656, "kl": 0.001869183441158384, "learning_rate": 5.397058823529411e-07, "loss": 1.8689306671149097e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 459.5, "completions/min_length": 355.0, "epoch": 1.0808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.004115428775548935, "kl": 0.0017157010443042964, "learning_rate": 5.404411764705882e-07, "loss": 1.72099989868002e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 436.8125, "completions/min_length": 393.0, "epoch": 1.0823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0951150432229042, "kl": 0.005019036965677515, "learning_rate": 5.411764705882353e-07, "loss": 4.9369285989087075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 532.375, "completions/min_length": 448.0, "epoch": 1.0838235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.436037540435791, "kl": 0.003251460730098188, "learning_rate": 5.419117647058823e-07, "loss": 3.24249267578125e-05, "reward": 0.26875001192092896, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 431.3125, "completions/min_length": 392.0, "epoch": 1.0852941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.005985742434859276, "kl": 0.002155869355192408, "learning_rate": 5.426470588235294e-07, "loss": 2.164393663406372e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 435.1875, "completions/min_length": 374.0, "epoch": 1.0867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1092250347137451, "kl": 0.0025389348447788507, "learning_rate": 5.433823529411764e-07, "loss": 2.5231616746168584e-05, "reward": 0.4552500247955322, "reward_std": 0.17580163478851318, "rewards/DrugCombAccuracyCOTORM/mean": 0.3841666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.49397483468055725, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 429.5625, "completions/min_length": 342.0, "epoch": 1.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01058998890221119, "kl": 0.0023219764116220176, "learning_rate": 5.441176470588234e-07, "loss": 2.338281956326682e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 430.375, "completions/min_length": 400.0, "epoch": 1.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.005373694933950901, "kl": 0.0021501537412405014, "learning_rate": 5.448529411764706e-07, "loss": 2.151768967451062e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 433.5, "completions/min_length": 355.0, "epoch": 1.0911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9043271541595459, "kl": 0.002397782140178606, "learning_rate": 5.455882352941176e-07, "loss": 2.3908913135528564e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 449.5625, "completions/min_length": 388.0, "epoch": 1.0926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.7409862279891968, "kl": 0.0036043838190380484, "learning_rate": 5.463235294117646e-07, "loss": 3.706654752022587e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 446.3125, "completions/min_length": 382.0, "epoch": 1.0941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1222888231277466, "kl": 0.00197338339057751, "learning_rate": 5.470588235294118e-07, "loss": 1.920759677886963e-05, "reward": 0.9089166522026062, "reward_std": 0.2576225697994232, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 430.3125, "completions/min_length": 396.0, "epoch": 1.0955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.006539901718497276, "kl": 0.0018720050575211644, "learning_rate": 5.477941176470589e-07, "loss": 1.879302362794988e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 462.9375, "completions/min_length": 419.0, "epoch": 1.0970588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 1.1206635236740112, "kl": 0.00208460382418707, "learning_rate": 5.485294117647059e-07, "loss": 2.088397741317749e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 405.625, "completions/min_length": 375.0, "epoch": 1.098529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.298921823501587, "kl": 0.0020838178461417556, "learning_rate": 5.49264705882353e-07, "loss": 2.100318670272827e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 400.0625, "completions/min_length": 318.0, "epoch": 1.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033718887716531754, "kl": 0.001607053738553077, "learning_rate": 5.5e-07, "loss": 1.5931494999676943e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 432.9375, "completions/min_length": 369.0, "epoch": 1.1014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.005320705007761717, "kl": 0.0019521616632118821, "learning_rate": 5.50735294117647e-07, "loss": 1.9413659174460918e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 431.625, "completions/min_length": 354.0, "epoch": 1.1029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2713500261306763, "kl": 0.0033947385963983834, "learning_rate": 5.514705882352942e-07, "loss": 3.4030526876449585e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 489.3125, "completions/min_length": 438.0, "epoch": 1.1044117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.3174272775650024, "kl": 0.0021880584536120296, "learning_rate": 5.522058823529412e-07, "loss": 2.191215753555298e-05, "reward": 0.4945833086967468, "reward_std": 0.35526520013809204, "rewards/DrugCombAccuracyCOTORM/mean": 0.3812500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.44093653559684753, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 449.25, "completions/min_length": 370.0, "epoch": 1.1058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0632330179214478, "kl": 0.002730886568315327, "learning_rate": 5.529411764705882e-07, "loss": 2.740323543548584e-05, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 511.9375, "completions/min_length": 393.0, "epoch": 1.1073529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.2822933197021484, "kl": 0.0022586706036236137, "learning_rate": 5.536764705882353e-07, "loss": 2.250075340270996e-05, "reward": 0.452666699886322, "reward_std": 0.3653903603553772, "rewards/DrugCombAccuracyCOTORM/mean": 0.42000001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.4144715368747711, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9583938121795654, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 485.5, "completions/min_length": 438.0, "epoch": 1.1088235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.1307979822158813, "kl": 0.002471551939379424, "learning_rate": 5.544117647058824e-07, "loss": 2.4631619453430176e-05, "reward": 0.25, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 459.125, "completions/min_length": 406.0, "epoch": 1.1102941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.488413691520691, "kl": 0.0025526518002152443, "learning_rate": 5.551470588235294e-07, "loss": 2.5570392608642578e-05, "reward": 0.5053333044052124, "reward_std": 0.2259000986814499, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833700954914093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 508.8125, "completions/min_length": 371.0, "epoch": 1.111764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8082236647605896, "kl": 0.0031101740314625204, "learning_rate": 5.558823529411765e-07, "loss": 3.0600956961279735e-05, "reward": 0.6333333253860474, "reward_std": 0.1522320657968521, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 469.375, "completions/min_length": 386.0, "epoch": 1.113235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.076871395111084, "kl": 0.0030438179965130985, "learning_rate": 5.566176470588235e-07, "loss": 3.030414882232435e-05, "reward": 0.862500011920929, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 473.0625, "completions/min_length": 422.0, "epoch": 1.1147058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1095025539398193, "kl": 0.0035810777044389397, "learning_rate": 5.573529411764706e-07, "loss": 3.524869680404663e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 559.875, "completions/min_length": 469.0, "epoch": 1.1161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0811766386032104, "kl": 0.003621333191404119, "learning_rate": 5.580882352941177e-07, "loss": 3.522634506225586e-05, "reward": 0.8057562708854675, "reward_std": 0.12888102233409882, "rewards/DrugCombAccuracyCOTORM/mean": 0.7679375410079956, "rewards/DrugCombAccuracyCOTORM/std": 0.3163015842437744, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9140625, "rewards/DrugCombCoverageCOTORM/std": 0.18662993609905243, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 439.625, "completions/min_length": 374.0, "epoch": 1.1176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.5456784963607788, "kl": 0.002474286942742765, "learning_rate": 5.588235294117647e-07, "loss": 2.429170854156837e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 493.9375, "completions/min_length": 433.0, "epoch": 1.1191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0699182748794556, "kl": 0.0026495156344026327, "learning_rate": 5.595588235294117e-07, "loss": 2.6457011699676514e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 490.875, "completions/min_length": 366.0, "epoch": 1.1205882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4959144592285156, "kl": 0.002391956077190116, "learning_rate": 5.602941176470588e-07, "loss": 2.4314969778060913e-05, "reward": 0.6517499685287476, "reward_std": 0.4106362760066986, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4699627757072449, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.42979323863983154, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 481.9375, "completions/min_length": 420.0, "epoch": 1.1220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008382653817534447, "kl": 0.002453103312291205, "learning_rate": 5.610294117647059e-07, "loss": 2.445473182888236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 456.125, "completions/min_length": 398.0, "epoch": 1.1235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 2.183938503265381, "kl": 0.005527043744223192, "learning_rate": 5.617647058823529e-07, "loss": 5.380809307098389e-05, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 451.4375, "completions/min_length": 384.0, "epoch": 1.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.008129220455884933, "kl": 0.002032590302405879, "learning_rate": 5.625e-07, "loss": 2.0370098354760557e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 463.6875, "completions/min_length": 411.0, "epoch": 1.1264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01562921330332756, "kl": 0.0027195096481591463, "learning_rate": 5.63235294117647e-07, "loss": 2.716545350267552e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 450.6875, "completions/min_length": 403.0, "epoch": 1.1279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0178580284118652, "kl": 0.0022234682110138237, "learning_rate": 5.639705882352941e-07, "loss": 2.2172927856445312e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 480.625, "completions/min_length": 393.0, "epoch": 1.1294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8496673703193665, "kl": 0.0025022936752066016, "learning_rate": 5.647058823529412e-07, "loss": 2.4968801881186664e-05, "reward": 0.6200833320617676, "reward_std": 0.06726156920194626, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.45625773072242737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5230785608291626, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 448.3125, "completions/min_length": 397.0, "epoch": 1.1308823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.636014699935913, "kl": 0.002485524833900854, "learning_rate": 5.654411764705882e-07, "loss": 2.4788081645965576e-05, "reward": 0.7089166641235352, "reward_std": 0.38346245884895325, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8858454823493958, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 460.1875, "completions/min_length": 355.0, "epoch": 1.1323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9574422240257263, "kl": 0.0021188950049690902, "learning_rate": 5.661764705882352e-07, "loss": 2.1250476493150927e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 443.3125, "completions/min_length": 386.0, "epoch": 1.1338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.46097591519355774, "kl": 0.005259863275568932, "learning_rate": 5.669117647058823e-07, "loss": 5.3009644034318626e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 457.625, "completions/min_length": 396.0, "epoch": 1.1352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.006815818138420582, "kl": 0.0019625753920990974, "learning_rate": 5.676470588235294e-07, "loss": 1.971063102246262e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 461.125, "completions/min_length": 407.0, "epoch": 1.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1670557260513306, "kl": 0.004197925853077322, "learning_rate": 5.683823529411764e-07, "loss": 4.0460421587340534e-05, "reward": 0.8589166402816772, "reward_std": 0.19595398008823395, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 421.125, "completions/min_length": 357.0, "epoch": 1.138235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.9971132278442383, "kl": 0.0039457191014662385, "learning_rate": 5.691176470588235e-07, "loss": 4.006922245025635e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 439.75, "completions/min_length": 372.0, "epoch": 1.1397058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.212207555770874, "kl": 0.0022354876855388284, "learning_rate": 5.698529411764705e-07, "loss": 2.2297535906545818e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 467.1875, "completions/min_length": 400.0, "epoch": 1.1411764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.006918848026543856, "kl": 0.0024953159445431083, "learning_rate": 5.705882352941176e-07, "loss": 2.4687557015568018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 450.8125, "completions/min_length": 318.0, "epoch": 1.1426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8335541486740112, "kl": 0.0021373797208070755, "learning_rate": 5.713235294117647e-07, "loss": 2.1261908841552213e-05, "reward": 0.9114583730697632, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 413.8125, "completions/min_length": 364.0, "epoch": 1.1441176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008255846798419952, "kl": 0.002619759237859398, "learning_rate": 5.720588235294117e-07, "loss": 2.6314995920984074e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 431.5, "completions/min_length": 407.0, "epoch": 1.1455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.00972954835742712, "kl": 0.002732996246777475, "learning_rate": 5.727941176470587e-07, "loss": 2.750816202023998e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 473.625, "completions/min_length": 398.0, "epoch": 1.1470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.5696223974227905, "kl": 0.002993967500515282, "learning_rate": 5.735294117647059e-07, "loss": 2.9958784580230713e-05, "reward": 0.632437527179718, "reward_std": 0.4295129179954529, "rewards/DrugCombAccuracyCOTORM/mean": 0.6089062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.46558207273483276, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.453125, "rewards/DrugCombCoverageCOTORM/std": 0.8718693852424622, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 424.25, "completions/min_length": 339.0, "epoch": 1.1485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.009075985290110111, "kl": 0.0021677800687029958, "learning_rate": 5.742647058823529e-07, "loss": 2.1776861103717238e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 457.625, "completions/min_length": 420.0, "epoch": 1.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.021742166951298714, "kl": 0.003339457151014358, "learning_rate": 5.749999999999999e-07, "loss": 3.3240907214349136e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 461.0, "completions/min_length": 428.0, "epoch": 1.151470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 18.139379501342773, "kl": 0.004835316969547421, "learning_rate": 5.75735294117647e-07, "loss": 4.797428846359253e-05, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 405.625, "completions/min_length": 348.0, "epoch": 1.1529411764705881, "frac_reward_zero_std": 1.0, "grad_norm": 0.018092069774866104, "kl": 0.0021153729176148772, "learning_rate": 5.76470588235294e-07, "loss": 2.1170424588490278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/mean_length": 573.875, "completions/min_length": 480.0, "epoch": 1.1544117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 3.1198742389678955, "kl": 0.04142815194791183, "learning_rate": 5.772058823529411e-07, "loss": 0.00043722614645957947, "reward": 0.43642351031303406, "reward_std": 0.2293117791414261, "rewards/DrugCombAccuracyCOTORM/mean": 0.3649304211139679, "rewards/DrugCombAccuracyCOTORM/std": 0.38105374574661255, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.44479167461395264, "rewards/DrugCombCoverageCOTORM/std": 0.7481018304824829, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 458.1875, "completions/min_length": 412.0, "epoch": 1.1558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.5960315465927124, "kl": 0.0025580982910469174, "learning_rate": 5.779411764705882e-07, "loss": 2.5406479835510254e-05, "reward": 0.629687488079071, "reward_std": 0.15439072251319885, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 471.25, "completions/min_length": 427.0, "epoch": 1.1573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9700007438659668, "kl": 0.003908236394636333, "learning_rate": 5.786764705882353e-07, "loss": 3.9126047340687364e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 436.25, "completions/min_length": 327.0, "epoch": 1.1588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.004926392342895269, "kl": 0.0018784103158395737, "learning_rate": 5.794117647058823e-07, "loss": 1.8662727597984485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 460.25, "completions/min_length": 412.0, "epoch": 1.160294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.6230723857879639, "kl": 0.0043531605042517185, "learning_rate": 5.801470588235295e-07, "loss": 4.430115222930908e-05, "reward": 0.6000000238418579, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 440.5, "completions/min_length": 386.0, "epoch": 1.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0573183298110962, "kl": 0.0028548818081617355, "learning_rate": 5.808823529411765e-07, "loss": 2.851084718713537e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 492.0, "completions/min_length": 415.0, "epoch": 1.1632352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3149951696395874, "kl": 0.0027117012650705874, "learning_rate": 5.816176470588235e-07, "loss": 2.7257949113845825e-05, "reward": 0.6169944405555725, "reward_std": 0.08996962010860443, "rewards/DrugCombAccuracyCOTORM/mean": 0.5672500133514404, "rewards/DrugCombAccuracyCOTORM/std": 0.46851646900177, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6319444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.5393468737602234, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 399.875, "completions/min_length": 331.0, "epoch": 1.1647058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.061741828918457, "kl": 0.0028102410724386573, "learning_rate": 5.823529411764706e-07, "loss": 2.7803482225863263e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 447.3125, "completions/min_length": 387.0, "epoch": 1.1661764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.1393448114395142, "kl": 0.0026001277728937566, "learning_rate": 5.830882352941177e-07, "loss": 2.5886880393954925e-05, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 499.75, "completions/min_length": 443.0, "epoch": 1.1676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9726263284683228, "kl": 0.0024775838246569037, "learning_rate": 5.838235294117647e-07, "loss": 2.481788396835327e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 408.25, "completions/min_length": 346.0, "epoch": 1.1691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008501275442540646, "kl": 0.0027695088065229356, "learning_rate": 5.845588235294118e-07, "loss": 2.7566435164771974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 487.875, "completions/min_length": 425.0, "epoch": 1.1705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.2864696979522705, "kl": 0.0019098717311862856, "learning_rate": 5.852941176470588e-07, "loss": 1.9103288650512695e-05, "reward": 0.643750011920929, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 470.3125, "completions/min_length": 405.0, "epoch": 1.1720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9694625735282898, "kl": 0.003127374511677772, "learning_rate": 5.860294117647058e-07, "loss": 3.112480044364929e-05, "reward": 0.6517113447189331, "reward_std": 0.20101265609264374, "rewards/DrugCombAccuracyCOTORM/mean": 0.6160714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.49409782886505127, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.71200031042099, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 561.3125, "completions/min_length": 399.0, "epoch": 1.173529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.6054136753082275, "kl": 0.0025815379922278225, "learning_rate": 5.86764705882353e-07, "loss": 2.5998800992965698e-05, "reward": 0.7092708349227905, "reward_std": 0.33573272824287415, "rewards/DrugCombAccuracyCOTORM/mean": 0.675000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4358898997306824, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6927083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4469223618507385, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 486.1875, "completions/min_length": 393.0, "epoch": 1.175, "frac_reward_zero_std": 0.5, "grad_norm": 1.5261104106903076, "kl": 0.003839221957605332, "learning_rate": 5.875e-07, "loss": 3.828915214398876e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 439.6875, "completions/min_length": 384.0, "epoch": 1.1764705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.949349045753479, "kl": 0.0029093720368109643, "learning_rate": 5.88235294117647e-07, "loss": 2.893149394367356e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 417.5, "completions/min_length": 350.0, "epoch": 1.1779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015066209249198437, "kl": 0.003340527357067913, "learning_rate": 5.889705882352941e-07, "loss": 3.317916707601398e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 442.3125, "completions/min_length": 412.0, "epoch": 1.1794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.005324806086719036, "kl": 0.002260951412608847, "learning_rate": 5.897058823529412e-07, "loss": 2.2626176360063255e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 469.625, "completions/min_length": 395.0, "epoch": 1.1808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.059207558631897, "kl": 0.0028064309153705835, "learning_rate": 5.904411764705882e-07, "loss": 2.816319465637207e-05, "reward": 0.8187500238418579, "reward_std": 0.23895233869552612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 497.875, "completions/min_length": 435.0, "epoch": 1.1823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3472695350646973, "kl": 0.0025820295559242368, "learning_rate": 5.911764705882353e-07, "loss": 2.5700777769088745e-05, "reward": 0.3687500059604645, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 440.0, "completions/min_length": 372.0, "epoch": 1.1838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8230543732643127, "kl": 0.002279244305100292, "learning_rate": 5.919117647058823e-07, "loss": 2.283849607920274e-05, "reward": 0.2750000059604645, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.25, "rewards/DrugCombCoverageCOTORM/std": 0.8563488721847534, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 556.0625, "completions/min_length": 427.0, "epoch": 1.1852941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1625080108642578, "kl": 0.0028908338863402605, "learning_rate": 5.926470588235293e-07, "loss": 2.854748527170159e-05, "reward": 0.6616071462631226, "reward_std": 0.15797266364097595, "rewards/DrugCombAccuracyCOTORM/mean": 0.6160714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.45912888646125793, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6718547940254211, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 456.6875, "completions/min_length": 359.0, "epoch": 1.186764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.005238201003521681, "kl": 0.002011987118748948, "learning_rate": 5.933823529411765e-07, "loss": 2.022631815634668e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 527.4375, "completions/min_length": 453.0, "epoch": 1.188235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1079522371292114, "kl": 0.002209694357588887, "learning_rate": 5.941176470588235e-07, "loss": 2.2076070308685303e-05, "reward": 0.746577799320221, "reward_std": 0.27045726776123047, "rewards/DrugCombAccuracyCOTORM/mean": 0.7109999656677246, "rewards/DrugCombAccuracyCOTORM/std": 0.36538928747177124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7777777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.25337231159210205, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 424.9375, "completions/min_length": 341.0, "epoch": 1.1897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9548782110214233, "kl": 0.0026077368238475174, "learning_rate": 5.948529411764705e-07, "loss": 2.5864554118015803e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 482.125, "completions/min_length": 405.0, "epoch": 1.1911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006277016829699278, "kl": 0.0022515577729791403, "learning_rate": 5.955882352941176e-07, "loss": 2.2558400814887136e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 446.375, "completions/min_length": 375.0, "epoch": 1.1926470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.63290536403656, "kl": 0.00288276223000139, "learning_rate": 5.963235294117647e-07, "loss": 2.902001142501831e-05, "reward": 0.737500011920929, "reward_std": 0.39058569073677063, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 479.5, "completions/min_length": 419.0, "epoch": 1.1941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1883307695388794, "kl": 0.0022667816083412617, "learning_rate": 5.970588235294117e-07, "loss": 2.272163692396134e-05, "reward": 0.5921875238418579, "reward_std": 0.022097086533904076, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 481.3125, "completions/min_length": 411.0, "epoch": 1.1955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.005939153954386711, "kl": 0.001965521980309859, "learning_rate": 5.977941176470588e-07, "loss": 1.963391332537867e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 472.6875, "completions/min_length": 394.0, "epoch": 1.1970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9985728859901428, "kl": 0.0027730175061151385, "learning_rate": 5.985294117647058e-07, "loss": 2.7915684768231586e-05, "reward": 0.8562500476837158, "reward_std": 0.059885792434215546, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 411.1875, "completions/min_length": 385.0, "epoch": 1.1985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01183510385453701, "kl": 0.003238190256524831, "learning_rate": 5.992647058823528e-07, "loss": 3.2572876079939306e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 442.1875, "completions/min_length": 397.0, "epoch": 1.2, "frac_reward_zero_std": 0.5, "grad_norm": 1.0495671033859253, "kl": 0.0026596187381073833, "learning_rate": 6e-07, "loss": 2.6466344934306107e-05, "reward": 0.8187500238418579, "reward_std": 0.2506242096424103, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 453.5, "completions/min_length": 351.0, "epoch": 1.201470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1122835874557495, "kl": 0.002979802433401346, "learning_rate": 6.00735294117647e-07, "loss": 3.0032688300707377e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 460.0625, "completions/min_length": 407.0, "epoch": 1.2029411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3890800476074219, "kl": 0.0025465379585511982, "learning_rate": 6.01470588235294e-07, "loss": 2.539902925491333e-05, "reward": 0.71875, "reward_std": 0.43578821420669556, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 461.625, "completions/min_length": 412.0, "epoch": 1.2044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9670411348342896, "kl": 0.0025865208881441504, "learning_rate": 6.022058823529411e-07, "loss": 2.5890767574310303e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 495.25, "completions/min_length": 403.0, "epoch": 1.2058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.5052450895309448, "kl": 0.0026484369300305843, "learning_rate": 6.029411764705882e-07, "loss": 2.6889145374298096e-05, "reward": 0.6039999723434448, "reward_std": 0.08923004567623138, "rewards/DrugCombAccuracyCOTORM/mean": 0.5518749952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4782881736755371, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.4654746949672699, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 539.0, "completions/min_length": 454.0, "epoch": 1.2073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7562388777732849, "kl": 0.0020378212502691895, "learning_rate": 6.036764705882352e-07, "loss": 2.0422041416168213e-05, "reward": 0.9312500357627869, "reward_std": 0.03282996639609337, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.09759000688791275, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 471.5, "completions/min_length": 439.0, "epoch": 1.2088235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1167068481445312, "kl": 0.002699631208088249, "learning_rate": 6.044117647058823e-07, "loss": 2.691894769668579e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 433.8125, "completions/min_length": 370.0, "epoch": 1.2102941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.022007694467902184, "kl": 0.0024713608145248145, "learning_rate": 6.051470588235293e-07, "loss": 2.425056300126016e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 429.8125, "completions/min_length": 369.0, "epoch": 1.2117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012648524716496468, "kl": 0.002626040019094944, "learning_rate": 6.058823529411763e-07, "loss": 2.6649020583136007e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 465.0, "completions/min_length": 427.0, "epoch": 1.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.004112216178327799, "kl": 0.0017924074782058597, "learning_rate": 6.066176470588235e-07, "loss": 1.7876078345580027e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 425.75, "completions/min_length": 391.0, "epoch": 1.214705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011481601744890213, "kl": 0.0026835135358851403, "learning_rate": 6.073529411764705e-07, "loss": 2.6399726266390644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 438.6875, "completions/min_length": 379.0, "epoch": 1.2161764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0926405191421509, "kl": 0.0029841757495887578, "learning_rate": 6.080882352941175e-07, "loss": 2.9898284992668778e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 439.3125, "completions/min_length": 380.0, "epoch": 1.2176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8526691198348999, "kl": 0.002074522024486214, "learning_rate": 6.088235294117646e-07, "loss": 2.0604580640792847e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 454.0625, "completions/min_length": 391.0, "epoch": 1.2191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0545457601547241, "kl": 0.002307216782355681, "learning_rate": 6.095588235294118e-07, "loss": 2.2999942302703857e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 458.625, "completions/min_length": 414.0, "epoch": 1.2205882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.0291531085968018, "kl": 0.0024107783392537385, "learning_rate": 6.102941176470589e-07, "loss": 2.431349093967583e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 442.625, "completions/min_length": 401.0, "epoch": 1.2220588235294119, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065580327063798904, "kl": 0.0024158708984032273, "learning_rate": 6.110294117647059e-07, "loss": 2.4282546291942708e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 418.0, "completions/min_length": 304.0, "epoch": 1.223529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0937073230743408, "kl": 0.0022044295619707555, "learning_rate": 6.117647058823529e-07, "loss": 2.199208756792359e-05, "reward": 0.565541684627533, "reward_std": 0.03264476731419563, "rewards/DrugCombAccuracyCOTORM/mean": 0.5103124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5073147416114807, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5729166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4790761172771454, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 487.625, "completions/min_length": 429.0, "epoch": 1.225, "frac_reward_zero_std": 0.5, "grad_norm": 1.6237648725509644, "kl": 0.002607107162475586, "learning_rate": 6.125000000000001e-07, "loss": 2.6270747184753418e-05, "reward": 0.4312500059604645, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 512.1875, "completions/min_length": 422.0, "epoch": 1.2264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9544015526771545, "kl": 0.0023483002150896937, "learning_rate": 6.132352941176471e-07, "loss": 2.3275784769793972e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 468.0625, "completions/min_length": 413.0, "epoch": 1.2279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.431408405303955, "kl": 0.003224285726901144, "learning_rate": 6.139705882352941e-07, "loss": 3.2543004635954276e-05, "reward": 0.6484375, "reward_std": 0.14212003350257874, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 470.75, "completions/min_length": 405.0, "epoch": 1.2294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 3.5321860313415527, "kl": 0.0028455361316446215, "learning_rate": 6.147058823529412e-07, "loss": 2.8442591428756714e-05, "reward": 0.550000011920929, "reward_std": 0.46579423546791077, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 466.0625, "completions/min_length": 423.0, "epoch": 1.2308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.004237243440002203, "kl": 0.0016569656145293266, "learning_rate": 6.154411764705883e-07, "loss": 1.6514584785909392e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 415.3125, "completions/min_length": 321.0, "epoch": 1.2323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.227496862411499, "kl": 0.003366249438840896, "learning_rate": 6.161764705882353e-07, "loss": 3.3430980693083256e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 404.9375, "completions/min_length": 366.0, "epoch": 1.2338235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.01204429380595684, "kl": 0.003031549887964502, "learning_rate": 6.169117647058824e-07, "loss": 3.0263599910540506e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 398.1875, "completions/min_length": 324.0, "epoch": 1.2352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.3741432428359985, "kl": 0.0027697172481566668, "learning_rate": 6.176470588235294e-07, "loss": 2.766392753983382e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 484.1875, "completions/min_length": 414.0, "epoch": 1.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00913504883646965, "kl": 0.0021619645413011312, "learning_rate": 6.183823529411764e-07, "loss": 2.1690986613975838e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 436.5625, "completions/min_length": 379.0, "epoch": 1.238235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01033686101436615, "kl": 0.002533661958295852, "learning_rate": 6.191176470588236e-07, "loss": 2.5271532649639994e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.125, "completions/min_length": 393.0, "epoch": 1.2397058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4785829782485962, "kl": 0.0031240051612257957, "learning_rate": 6.198529411764706e-07, "loss": 3.1251460313797e-05, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 538.25, "completions/min_length": 474.0, "epoch": 1.2411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.5185211896896362, "kl": 0.0034124209196306765, "learning_rate": 6.205882352941176e-07, "loss": 3.340751209179871e-05, "reward": 0.3187499940395355, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.4375, "completions/min_length": 410.0, "epoch": 1.2426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01572156697511673, "kl": 0.0027992784744128585, "learning_rate": 6.213235294117647e-07, "loss": 2.7915259124711156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 422.4375, "completions/min_length": 350.0, "epoch": 1.2441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009172092191874981, "kl": 0.002433111803838983, "learning_rate": 6.220588235294118e-07, "loss": 2.4419688998023048e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 467.8125, "completions/min_length": 418.0, "epoch": 1.2455882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.5013129711151123, "kl": 0.002460711693856865, "learning_rate": 6.227941176470588e-07, "loss": 2.4631619453430176e-05, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 402.1875, "completions/min_length": 365.0, "epoch": 1.2470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061597260646522045, "kl": 0.0020838282362092286, "learning_rate": 6.235294117647059e-07, "loss": 2.0904351913486607e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 509.8125, "completions/min_length": 370.0, "epoch": 1.2485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8470466732978821, "kl": 0.00227993429871276, "learning_rate": 6.242647058823529e-07, "loss": 2.2934789740247652e-05, "reward": 0.6432916522026062, "reward_std": 0.0952177420258522, "rewards/DrugCombAccuracyCOTORM/mean": 0.5905728936195374, "rewards/DrugCombAccuracyCOTORM/std": 0.45101311802864075, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.32489314675331116, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 470.5625, "completions/min_length": 395.0, "epoch": 1.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.9141653180122375, "kl": 0.0024406975717283785, "learning_rate": 6.249999999999999e-07, "loss": 2.4266541004180908e-05, "reward": 0.8958333134651184, "reward_std": 0.11347679048776627, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 438.375, "completions/min_length": 394.0, "epoch": 1.2514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.004957954399287701, "kl": 0.002286387432832271, "learning_rate": 6.257352941176471e-07, "loss": 2.265741932205856e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 464.3125, "completions/min_length": 417.0, "epoch": 1.2529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9167082905769348, "kl": 0.0018071202212013304, "learning_rate": 6.264705882352941e-07, "loss": 1.8075108528137207e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 416.125, "completions/min_length": 356.0, "epoch": 1.2544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.005037019029259682, "kl": 0.0020699629094451666, "learning_rate": 6.272058823529411e-07, "loss": 2.072072857117746e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 461.75, "completions/min_length": 405.0, "epoch": 1.2558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015477410517632961, "kl": 0.002743309538345784, "learning_rate": 6.279411764705882e-07, "loss": 2.7268284611636773e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 463.5625, "completions/min_length": 419.0, "epoch": 1.2573529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.67410409450531, "kl": 0.003122462483588606, "learning_rate": 6.286764705882353e-07, "loss": 3.094598650932312e-05, "reward": 0.699999988079071, "reward_std": 0.40868258476257324, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 431.6875, "completions/min_length": 384.0, "epoch": 1.2588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.06958736479282379, "kl": 0.004279452434275299, "learning_rate": 6.294117647058823e-07, "loss": 4.333600008976646e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 411.625, "completions/min_length": 334.0, "epoch": 1.2602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.006788746919482946, "kl": 0.001998966297833249, "learning_rate": 6.301470588235294e-07, "loss": 2.0026220227009617e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 438.125, "completions/min_length": 353.0, "epoch": 1.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.004054328892379999, "kl": 0.0018376236548647285, "learning_rate": 6.308823529411764e-07, "loss": 1.8262455341755413e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 429.0, "completions/min_length": 380.0, "epoch": 1.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.038305401802063, "kl": 0.003001376986503601, "learning_rate": 6.316176470588234e-07, "loss": 3.0152499675750732e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 490.9375, "completions/min_length": 441.0, "epoch": 1.2647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.014643907546997, "kl": 0.0024743869435042143, "learning_rate": 6.323529411764706e-07, "loss": 2.4762099201325327e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 493.125, "completions/min_length": 361.0, "epoch": 1.2661764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.8982363343238831, "kl": 0.0027934444369748235, "learning_rate": 6.330882352941176e-07, "loss": 2.7725563995772973e-05, "reward": 0.7740625143051147, "reward_std": 0.1982179880142212, "rewards/DrugCombAccuracyCOTORM/mean": 0.739062488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.4072232246398926, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.33592739701271057, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 488.6875, "completions/min_length": 445.0, "epoch": 1.2676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0834769010543823, "kl": 0.0024695448810234666, "learning_rate": 6.338235294117646e-07, "loss": 2.4594366550445557e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 434.3125, "completions/min_length": 370.0, "epoch": 1.2691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0392128229141235, "kl": 0.002479787013726309, "learning_rate": 6.345588235294117e-07, "loss": 2.4825334548950195e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 434.8125, "completions/min_length": 383.0, "epoch": 1.2705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013720612972974777, "kl": 0.0032396038295701146, "learning_rate": 6.352941176470588e-07, "loss": 3.1748510082252324e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 518.9375, "completions/min_length": 411.0, "epoch": 1.2720588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.820258378982544, "kl": 0.0036382475518621504, "learning_rate": 6.360294117647058e-07, "loss": 3.6813318729400635e-05, "reward": 0.592348575592041, "reward_std": 0.3131115138530731, "rewards/DrugCombAccuracyCOTORM/mean": 0.5141336917877197, "rewards/DrugCombAccuracyCOTORM/std": 0.4140850901603699, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8104166388511658, "rewards/DrugCombCoverageCOTORM/std": 0.32804837822914124, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 430.75, "completions/min_length": 378.0, "epoch": 1.2735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1476351022720337, "kl": 0.0032396704773418605, "learning_rate": 6.367647058823529e-07, "loss": 3.2782554626464844e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 472.75, "completions/min_length": 385.0, "epoch": 1.275, "frac_reward_zero_std": 0.5, "grad_norm": 1.2654411792755127, "kl": 0.002780707029160112, "learning_rate": 6.374999999999999e-07, "loss": 2.802908420562744e-05, "reward": 0.8616636991500854, "reward_std": 0.1628040373325348, "rewards/DrugCombAccuracyCOTORM/mean": 0.8368452787399292, "rewards/DrugCombAccuracyCOTORM/std": 0.31330981850624084, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.14099103212356567, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 467.125, "completions/min_length": 419.0, "epoch": 1.276470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.827095627784729, "kl": 0.0017895005003083497, "learning_rate": 6.38235294117647e-07, "loss": 1.776963472366333e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 449.625, "completions/min_length": 380.0, "epoch": 1.2779411764705881, "frac_reward_zero_std": 1.0, "grad_norm": 0.012361708097159863, "kl": 0.0031607496784999967, "learning_rate": 6.389705882352941e-07, "loss": 3.1129686249187216e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 407.375, "completions/min_length": 366.0, "epoch": 1.2794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.004050465766340494, "kl": 0.0017377616895828396, "learning_rate": 6.397058823529411e-07, "loss": 1.733746103127487e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 431.25, "completions/min_length": 380.0, "epoch": 1.2808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2541770935058594, "kl": 0.005138164910022169, "learning_rate": 6.404411764705881e-07, "loss": 5.200505256652832e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 491.9375, "completions/min_length": 405.0, "epoch": 1.2823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.3882744312286377, "kl": 0.012236436537932605, "learning_rate": 6.411764705882354e-07, "loss": 0.0001247712061740458, "reward": 0.652396559715271, "reward_std": 0.04754616320133209, "rewards/DrugCombAccuracyCOTORM/mean": 0.5894322991371155, "rewards/DrugCombAccuracyCOTORM/std": 0.4294142425060272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.808506965637207, "rewards/DrugCombCoverageCOTORM/std": 0.2427539825439453, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 460.1875, "completions/min_length": 404.0, "epoch": 1.2838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077455476857721806, "kl": 0.001752247044350952, "learning_rate": 6.419117647058824e-07, "loss": 1.7575388483237475e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 489.875, "completions/min_length": 431.0, "epoch": 1.2852941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047308229841291904, "kl": 0.002205205528298393, "learning_rate": 6.426470588235294e-07, "loss": 2.196069181081839e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 485.125, "completions/min_length": 420.0, "epoch": 1.2867647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.0227773189544678, "kl": 0.003082536451984197, "learning_rate": 6.433823529411765e-07, "loss": 3.053879117942415e-05, "reward": 0.8999999761581421, "reward_std": 0.15456029772758484, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2770128548145294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 478.0, "completions/min_length": 401.0, "epoch": 1.288235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006699942518025637, "kl": 0.0025797545386012644, "learning_rate": 6.441176470588235e-07, "loss": 2.5662688130978495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 417.125, "completions/min_length": 388.0, "epoch": 1.2897058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.1949609518051147, "kl": 0.002430928638204932, "learning_rate": 6.448529411764706e-07, "loss": 2.4244189262390137e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 473.125, "completions/min_length": 325.0, "epoch": 1.2911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9802259802818298, "kl": 0.002403542253887281, "learning_rate": 6.455882352941177e-07, "loss": 2.4133556507877074e-05, "reward": 0.9645833373069763, "reward_std": 0.06557891517877579, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 464.6875, "completions/min_length": 394.0, "epoch": 1.2926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.008529416285455227, "kl": 0.002204637770773843, "learning_rate": 6.463235294117647e-07, "loss": 2.1939225916867144e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 461.4375, "completions/min_length": 399.0, "epoch": 1.2941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.829594612121582, "kl": 0.002723144832998514, "learning_rate": 6.470588235294117e-07, "loss": 2.7033984224544838e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 447.5625, "completions/min_length": 401.0, "epoch": 1.2955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1670695543289185, "kl": 0.017520757450256497, "learning_rate": 6.477941176470589e-07, "loss": 0.00017052624025382102, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 439.4375, "completions/min_length": 371.0, "epoch": 1.2970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.006003889720886946, "kl": 0.0020799135381821543, "learning_rate": 6.485294117647059e-07, "loss": 2.082637911371421e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.375, "completions/min_length": 380.0, "epoch": 1.298529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.005980238318443298, "kl": 0.0022988697746768594, "learning_rate": 6.492647058823529e-07, "loss": 2.2973092200118117e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 479.5625, "completions/min_length": 409.0, "epoch": 1.3, "frac_reward_zero_std": 0.0, "grad_norm": 1.7048476934432983, "kl": 0.0032347587984986603, "learning_rate": 6.5e-07, "loss": 3.217533230781555e-05, "reward": 0.5531249642372131, "reward_std": 0.3428332209587097, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.46435439586639404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.28125, "rewards/DrugCombCoverageCOTORM/std": 0.9123002290725708, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 478.375, "completions/min_length": 421.0, "epoch": 1.3014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.015580005012452602, "kl": 0.003052675776416436, "learning_rate": 6.50735294117647e-07, "loss": 3.0421804694924504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 447.875, "completions/min_length": 344.0, "epoch": 1.3029411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.445448398590088, "kl": 0.002749880775809288, "learning_rate": 6.514705882352941e-07, "loss": 2.7060508728027344e-05, "reward": 0.3389166593551636, "reward_std": 0.24918676912784576, "rewards/DrugCombAccuracyCOTORM/mean": 0.20749999582767487, "rewards/DrugCombAccuracyCOTORM/std": 0.326751708984375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.49018141627311707, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 434.375, "completions/min_length": 363.0, "epoch": 1.3044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01061164028942585, "kl": 0.002230825455626473, "learning_rate": 6.522058823529412e-07, "loss": 2.2229130991036072e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 508.0, "completions/min_length": 462.0, "epoch": 1.3058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.3964965343475342, "kl": 0.0030160879250615835, "learning_rate": 6.529411764705882e-07, "loss": 3.016740083694458e-05, "reward": 0.4437500238418579, "reward_std": 0.38548365235328674, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 444.75, "completions/min_length": 312.0, "epoch": 1.3073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007249837275594473, "kl": 0.0025573098100721836, "learning_rate": 6.536764705882352e-07, "loss": 2.5546694814693183e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 445.375, "completions/min_length": 346.0, "epoch": 1.3088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00905677955597639, "kl": 0.0026236355770379305, "learning_rate": 6.544117647058824e-07, "loss": 2.6059778974740766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 424.5625, "completions/min_length": 374.0, "epoch": 1.3102941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9643595814704895, "kl": 0.0020399510685820132, "learning_rate": 6.551470588235294e-07, "loss": 2.0468887669267133e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 505.9375, "completions/min_length": 403.0, "epoch": 1.311764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1116176843643188, "kl": 0.0025057139864657074, "learning_rate": 6.558823529411764e-07, "loss": 2.491474151611328e-05, "reward": 0.784044623374939, "reward_std": 0.22176438570022583, "rewards/DrugCombAccuracyCOTORM/mean": 0.7632589340209961, "rewards/DrugCombAccuracyCOTORM/std": 0.4261437654495239, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.734375, "rewards/DrugCombCoverageCOTORM/std": 0.4422362744808197, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 443.1875, "completions/min_length": 379.0, "epoch": 1.313235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2447997331619263, "kl": 0.0036574299447238445, "learning_rate": 6.566176470588235e-07, "loss": 3.674080653581768e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 469.25, "completions/min_length": 427.0, "epoch": 1.3147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.00730583630502224, "kl": 0.0027435519150458276, "learning_rate": 6.573529411764705e-07, "loss": 2.741108983173035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 463.6875, "completions/min_length": 390.0, "epoch": 1.3161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011210411787033081, "kl": 0.003076387452892959, "learning_rate": 6.580882352941176e-07, "loss": 3.012514571310021e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 482.375, "completions/min_length": 367.0, "epoch": 1.3176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8933215141296387, "kl": 0.0024149694363586605, "learning_rate": 6.588235294117647e-07, "loss": 2.368374953221064e-05, "reward": 0.6541666984558105, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3824869990348816, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 482.5, "completions/min_length": 459.0, "epoch": 1.3191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.007015984505414963, "kl": 0.0021689556015189737, "learning_rate": 6.595588235294117e-07, "loss": 2.1696416297345422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 458.5625, "completions/min_length": 395.0, "epoch": 1.3205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.3930096626281738, "kl": 0.0033116393024101853, "learning_rate": 6.602941176470587e-07, "loss": 3.323845157865435e-05, "reward": 0.643750011920929, "reward_std": 0.14500616490840912, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 488.5625, "completions/min_length": 409.0, "epoch": 1.3220588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.6929080486297607, "kl": 0.003351237333845347, "learning_rate": 6.610294117647059e-07, "loss": 3.372877836227417e-05, "reward": 0.6553333401679993, "reward_std": 0.4203506112098694, "rewards/DrugCombAccuracyCOTORM/mean": 0.5900000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4849192500114441, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 436.625, "completions/min_length": 367.0, "epoch": 1.3235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.007069403771311045, "kl": 0.0023971267510205507, "learning_rate": 6.617647058823529e-07, "loss": 2.3948879970703274e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 494.4375, "completions/min_length": 430.0, "epoch": 1.325, "frac_reward_zero_std": 0.0, "grad_norm": 1.3795732259750366, "kl": 0.003335450019221753, "learning_rate": 6.624999999999999e-07, "loss": 3.330036997795105e-05, "reward": 0.8145833015441895, "reward_std": 0.3564217984676361, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 440.25, "completions/min_length": 390.0, "epoch": 1.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.007347898092120886, "kl": 0.0027741240337491035, "learning_rate": 6.63235294117647e-07, "loss": 2.7702686566044576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 488.1875, "completions/min_length": 410.0, "epoch": 1.3279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8077667951583862, "kl": 0.0020577773393597454, "learning_rate": 6.63970588235294e-07, "loss": 2.0481646060943604e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 457.125, "completions/min_length": 413.0, "epoch": 1.3294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.8535710573196411, "kl": 0.003822677885182202, "learning_rate": 6.647058823529411e-07, "loss": 3.8154423236846924e-05, "reward": 0.8374999761581421, "reward_std": 0.35143834352493286, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 463.125, "completions/min_length": 414.0, "epoch": 1.3308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02340080961585045, "kl": 0.0033912959625013173, "learning_rate": 6.654411764705882e-07, "loss": 3.448624920565635e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 453.25, "completions/min_length": 408.0, "epoch": 1.3323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.3868045806884766, "kl": 0.002538345492212102, "learning_rate": 6.661764705882352e-07, "loss": 2.543628215789795e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 476.3125, "completions/min_length": 406.0, "epoch": 1.3338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.013778542168438435, "kl": 0.003009117965120822, "learning_rate": 6.669117647058822e-07, "loss": 2.99274342978606e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 462.0625, "completions/min_length": 392.0, "epoch": 1.3352941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.8997702598571777, "kl": 0.001965875009773299, "learning_rate": 6.676470588235294e-07, "loss": 1.9545361283235252e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 491.5625, "completions/min_length": 403.0, "epoch": 1.336764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01734502986073494, "kl": 0.0027801793185062706, "learning_rate": 6.683823529411764e-07, "loss": 2.770583341771271e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 424.6875, "completions/min_length": 382.0, "epoch": 1.3382352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.007122434210032225, "kl": 0.002544288639910519, "learning_rate": 6.691176470588234e-07, "loss": 2.5398687284905463e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 483.1875, "completions/min_length": 409.0, "epoch": 1.3397058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 1.151944637298584, "kl": 0.002656546188518405, "learning_rate": 6.698529411764705e-07, "loss": 2.6695430278778076e-05, "reward": 0.5729166865348816, "reward_std": 0.3056872487068176, "rewards/DrugCombAccuracyCOTORM/mean": 0.5208333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.48638883233070374, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 494.25, "completions/min_length": 389.0, "epoch": 1.3411764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.1331998109817505, "kl": 0.003928895981516689, "learning_rate": 6.705882352941176e-07, "loss": 3.910372470272705e-05, "reward": 0.6797500252723694, "reward_std": 0.10986077785491943, "rewards/DrugCombAccuracyCOTORM/mean": 0.6179167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.4384600818157196, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22669117152690887, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 442.25, "completions/min_length": 390.0, "epoch": 1.3426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.04276999831199646, "kl": 0.003824856597930193, "learning_rate": 6.713235294117646e-07, "loss": 3.882752571371384e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 422.6875, "completions/min_length": 355.0, "epoch": 1.3441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.00488799624145031, "kl": 0.002050808514468372, "learning_rate": 6.720588235294118e-07, "loss": 2.0435272745089605e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 483.25, "completions/min_length": 443.0, "epoch": 1.3455882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.059385061264038, "kl": 0.00246783034526743, "learning_rate": 6.727941176470588e-07, "loss": 2.4730616132728755e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 450.625, "completions/min_length": 405.0, "epoch": 1.3470588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 1.281197428703308, "kl": 0.004060940787894651, "learning_rate": 6.735294117647058e-07, "loss": 4.068969064974226e-05, "reward": 0.77715003490448, "reward_std": 0.18937557935714722, "rewards/DrugCombAccuracyCOTORM/mean": 0.7381786108016968, "rewards/DrugCombAccuracyCOTORM/std": 0.40540674328804016, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8660714626312256, "rewards/DrugCombCoverageCOTORM/std": 0.23761142790317535, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 466.1875, "completions/min_length": 368.0, "epoch": 1.348529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.044075749814510345, "kl": 0.0036686981911771, "learning_rate": 6.74264705882353e-07, "loss": 3.6851408367510885e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 478.9375, "completions/min_length": 352.0, "epoch": 1.35, "frac_reward_zero_std": 0.5, "grad_norm": 0.940744936466217, "kl": 0.0032500861561857164, "learning_rate": 6.75e-07, "loss": 3.207474946975708e-05, "reward": 0.5250000357627869, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.40625, "rewards/DrugCombAccuracyCOTORM/std": 0.4552929699420929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 445.0625, "completions/min_length": 380.0, "epoch": 1.3514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.122127890586853, "kl": 0.003512648108880967, "learning_rate": 6.757352941176471e-07, "loss": 3.523502527968958e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 470.75, "completions/min_length": 428.0, "epoch": 1.3529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.099929690361023, "kl": 0.0034754977677948773, "learning_rate": 6.764705882352941e-07, "loss": 3.4883814805652946e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 497.0625, "completions/min_length": 382.0, "epoch": 1.3544117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.510991096496582, "kl": 0.0029080326785333455, "learning_rate": 6.772058823529412e-07, "loss": 2.9146671295166016e-05, "reward": 0.6875, "reward_std": 0.20830950140953064, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 458.0, "completions/min_length": 414.0, "epoch": 1.3558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9949932098388672, "kl": 0.0023121205158531666, "learning_rate": 6.779411764705883e-07, "loss": 2.341822982998565e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 533.6875, "completions/min_length": 448.0, "epoch": 1.3573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.109455943107605, "kl": 0.007025158032774925, "learning_rate": 6.786764705882353e-07, "loss": 6.824731826782227e-05, "reward": 0.9800000190734863, "reward_std": 0.037032779306173325, "rewards/DrugCombAccuracyCOTORM/mean": 0.9750000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.06831300258636475, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 467.25, "completions/min_length": 416.0, "epoch": 1.3588235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.007126126904040575, "kl": 0.002571899618487805, "learning_rate": 6.794117647058823e-07, "loss": 2.5702920538606122e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 453.3125, "completions/min_length": 394.0, "epoch": 1.3602941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9805792570114136, "kl": 0.002323253283975646, "learning_rate": 6.801470588235295e-07, "loss": 2.3286364012164995e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 448.625, "completions/min_length": 427.0, "epoch": 1.361764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9869259595870972, "kl": 0.002362737781368196, "learning_rate": 6.808823529411765e-07, "loss": 2.3696571588516235e-05, "reward": 0.8040624856948853, "reward_std": 0.16225165128707886, "rewards/DrugCombAccuracyCOTORM/mean": 0.7648437023162842, "rewards/DrugCombAccuracyCOTORM/std": 0.3602319657802582, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 450.125, "completions/min_length": 358.0, "epoch": 1.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9951885938644409, "kl": 0.0022208598093129694, "learning_rate": 6.816176470588235e-07, "loss": 2.2179239749675617e-05, "reward": 0.8219166994094849, "reward_std": 0.1856452226638794, "rewards/DrugCombAccuracyCOTORM/mean": 0.8216666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.32173144817352295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 480.5, "completions/min_length": 402.0, "epoch": 1.3647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8611161708831787, "kl": 0.003859842719975859, "learning_rate": 6.823529411764706e-07, "loss": 3.8885373214725405e-05, "reward": 0.7171000242233276, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.6744999885559082, "rewards/DrugCombAccuracyCOTORM/std": 0.43400004506111145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7749999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.30000001192092896, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 466.0, "completions/min_length": 357.0, "epoch": 1.3661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9551085233688354, "kl": 0.0020952927006874233, "learning_rate": 6.830882352941176e-07, "loss": 2.081273305520881e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 463.9375, "completions/min_length": 435.0, "epoch": 1.3676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013673465698957443, "kl": 0.0031651703757233918, "learning_rate": 6.838235294117647e-07, "loss": 3.171075877617113e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 407.625, "completions/min_length": 374.0, "epoch": 1.3691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009637534618377686, "kl": 0.0023688643414061517, "learning_rate": 6.845588235294118e-07, "loss": 2.3414584575220942e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 474.0, "completions/min_length": 368.0, "epoch": 1.3705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9773917198181152, "kl": 0.002737418340984732, "learning_rate": 6.852941176470588e-07, "loss": 2.717455936362967e-05, "reward": 0.5676249861717224, "reward_std": 0.031726229935884476, "rewards/DrugCombAccuracyCOTORM/mean": 0.5103124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5073147416114807, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.4552929699420929, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 477.0625, "completions/min_length": 380.0, "epoch": 1.3720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.007792353164404631, "kl": 0.0024681723152752966, "learning_rate": 6.860294117647058e-07, "loss": 2.4708548153284937e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 492.4375, "completions/min_length": 430.0, "epoch": 1.3735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9854573011398315, "kl": 0.0025535807944834232, "learning_rate": 6.86764705882353e-07, "loss": 2.539902925491333e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 456.125, "completions/min_length": 402.0, "epoch": 1.375, "frac_reward_zero_std": 0.5, "grad_norm": 0.9812512397766113, "kl": 0.002598503837361932, "learning_rate": 6.875e-07, "loss": 2.5939196348190308e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 484.3125, "completions/min_length": 424.0, "epoch": 1.3764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.2436988353729248, "kl": 0.002761014533462003, "learning_rate": 6.88235294117647e-07, "loss": 2.7179441531188786e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 466.5, "completions/min_length": 365.0, "epoch": 1.3779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010454985313117504, "kl": 0.002754752349574119, "learning_rate": 6.889705882352941e-07, "loss": 2.7800133466371335e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 465.5625, "completions/min_length": 410.0, "epoch": 1.3794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0119289830327034, "kl": 0.00293311954010278, "learning_rate": 6.897058823529411e-07, "loss": 2.9410461138468236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 438.25, "completions/min_length": 372.0, "epoch": 1.3808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077029261738061905, "kl": 0.002532734564738348, "learning_rate": 6.904411764705882e-07, "loss": 2.5197037757607177e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 403.375, "completions/min_length": 374.0, "epoch": 1.3823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.004950407426804304, "kl": 0.0020224587060511112, "learning_rate": 6.911764705882353e-07, "loss": 2.0020177544211037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 438.0625, "completions/min_length": 357.0, "epoch": 1.3838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007296810857951641, "kl": 0.0026234968099743128, "learning_rate": 6.919117647058823e-07, "loss": 2.5808612917899154e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 462.125, "completions/min_length": 376.0, "epoch": 1.3852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02426980622112751, "kl": 0.0035618403926491737, "learning_rate": 6.926470588235293e-07, "loss": 3.5957800719188526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 430.875, "completions/min_length": 346.0, "epoch": 1.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9512307047843933, "kl": 0.0035914486506953835, "learning_rate": 6.933823529411765e-07, "loss": 3.555789589881897e-05, "reward": 0.699999988079071, "reward_std": 0.2507132589817047, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 455.5, "completions/min_length": 382.0, "epoch": 1.388235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0627140998840332, "kl": 0.0030463399598374963, "learning_rate": 6.941176470588235e-07, "loss": 3.041508716705721e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 422.4375, "completions/min_length": 343.0, "epoch": 1.3897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.015090398490428925, "kl": 0.0026129421312361956, "learning_rate": 6.948529411764705e-07, "loss": 2.6274992706021294e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 422.8125, "completions/min_length": 367.0, "epoch": 1.3911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9489574432373047, "kl": 0.0023502562544308603, "learning_rate": 6.955882352941176e-07, "loss": 2.3715198040008545e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 464.1875, "completions/min_length": 394.0, "epoch": 1.3926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0877100229263306, "kl": 0.002429957239655778, "learning_rate": 6.963235294117647e-07, "loss": 2.4080276489257812e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 428.375, "completions/min_length": 354.0, "epoch": 1.3941176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.006750334519892931, "kl": 0.0022905765217728913, "learning_rate": 6.970588235294117e-07, "loss": 2.271774792461656e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/mean_length": 472.4375, "completions/min_length": 341.0, "epoch": 1.3955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1600620746612549, "kl": 0.0028069853433407843, "learning_rate": 6.977941176470588e-07, "loss": 2.7897100153495558e-05, "reward": 0.6346666812896729, "reward_std": 0.04364939406514168, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.45625773072242737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 448.3125, "completions/min_length": 371.0, "epoch": 1.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 37.89308166503906, "kl": 0.0702131058787927, "learning_rate": 6.985294117647058e-07, "loss": 0.0007199771353043616, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 485.4375, "completions/min_length": 405.0, "epoch": 1.3985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.05782519280910492, "kl": 0.0029036201594863087, "learning_rate": 6.992647058823528e-07, "loss": 2.8473654310801066e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 485.1875, "completions/min_length": 407.0, "epoch": 1.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.4290218353271484, "kl": 0.0033082797890529037, "learning_rate": 7e-07, "loss": 3.2708048820495605e-05, "reward": 0.6412083506584167, "reward_std": 0.4044071435928345, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5063257217407227, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 446.8125, "completions/min_length": 385.0, "epoch": 1.401470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063579389825463295, "kl": 0.002424559701466933, "learning_rate": 7.00735294117647e-07, "loss": 2.4236780518549494e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 434.25, "completions/min_length": 384.0, "epoch": 1.4029411764705881, "frac_reward_zero_std": 1.0, "grad_norm": 0.008004292845726013, "kl": 0.002597381127998233, "learning_rate": 7.01470588235294e-07, "loss": 2.620113082230091e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 483.25, "completions/min_length": 393.0, "epoch": 1.4044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.6604078412055969, "kl": 0.002179878647439182, "learning_rate": 7.022058823529411e-07, "loss": 2.1722378733102232e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 438.1875, "completions/min_length": 389.0, "epoch": 1.4058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008311997167766094, "kl": 0.002670989080797881, "learning_rate": 7.029411764705882e-07, "loss": 2.658639095898252e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 424.25, "completions/min_length": 387.0, "epoch": 1.4073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.006455659866333, "kl": 0.0023926544236019254, "learning_rate": 7.036764705882353e-07, "loss": 2.402053723926656e-05, "reward": 0.3500000238418579, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 446.8125, "completions/min_length": 415.0, "epoch": 1.4088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.008578849025070667, "kl": 0.0025278821703977883, "learning_rate": 7.044117647058824e-07, "loss": 2.539723936934024e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 533.0625, "completions/min_length": 413.0, "epoch": 1.4102941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.9101465940475464, "kl": 0.002376100397668779, "learning_rate": 7.051470588235294e-07, "loss": 2.3636432160856202e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 456.25, "completions/min_length": 422.0, "epoch": 1.4117647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.012484506703913212, "kl": 0.0028686629375442863, "learning_rate": 7.058823529411765e-07, "loss": 2.8573133022291586e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 503.625, "completions/min_length": 444.0, "epoch": 1.413235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3994152545928955, "kl": 0.004146770806983113, "learning_rate": 7.066176470588236e-07, "loss": 4.1872262954711914e-05, "reward": 0.800000011920929, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 423.4375, "completions/min_length": 353.0, "epoch": 1.4147058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 1.7566536664962769, "kl": 0.003103378781815991, "learning_rate": 7.073529411764706e-07, "loss": 3.091245889663696e-05, "reward": 0.5648333430290222, "reward_std": 0.22381466627120972, "rewards/DrugCombAccuracyCOTORM/mean": 0.4925000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.46994325518608093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4849589467048645, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 456.75, "completions/min_length": 406.0, "epoch": 1.4161764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.007549012545496225, "kl": 0.0025637283688411117, "learning_rate": 7.080882352941176e-07, "loss": 2.5583032766007818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 506.5, "completions/min_length": 437.0, "epoch": 1.4176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.2070999145507812, "kl": 0.0025589170400053263, "learning_rate": 7.088235294117647e-07, "loss": 2.5451183319091797e-05, "reward": 0.6000000238418579, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 477.4375, "completions/min_length": 398.0, "epoch": 1.4191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1040711402893066, "kl": 0.0027730544097721577, "learning_rate": 7.095588235294118e-07, "loss": 2.7880072593688965e-05, "reward": 0.746874988079071, "reward_std": 0.20977772772312164, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 519.875, "completions/min_length": 456.0, "epoch": 1.4205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0037678480148315, "kl": 0.002815919287968427, "learning_rate": 7.102941176470588e-07, "loss": 2.808417775668204e-05, "reward": 0.6956250071525574, "reward_std": 0.12307018041610718, "rewards/DrugCombAccuracyCOTORM/mean": 0.6435267925262451, "rewards/DrugCombAccuracyCOTORM/std": 0.4174967110157013, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8080357313156128, "rewards/DrugCombCoverageCOTORM/std": 0.22545304894447327, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 472.625, "completions/min_length": 407.0, "epoch": 1.4220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0318701267242432, "kl": 0.002669137960765511, "learning_rate": 7.110294117647059e-07, "loss": 2.6911140594165772e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 451.0, "completions/min_length": 354.0, "epoch": 1.423529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01606256142258644, "kl": 0.0030149212980177253, "learning_rate": 7.117647058823529e-07, "loss": 3.0101769880275242e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 444.625, "completions/min_length": 410.0, "epoch": 1.425, "frac_reward_zero_std": 1.0, "grad_norm": 0.010779144242405891, "kl": 0.0026878952921833843, "learning_rate": 7.125e-07, "loss": 2.6903693651547655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 468.25, "completions/min_length": 377.0, "epoch": 1.4264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0171808004379272, "kl": 0.002220753754954785, "learning_rate": 7.132352941176471e-07, "loss": 2.228282392024994e-05, "reward": 0.6610000133514404, "reward_std": 0.17885811626911163, "rewards/DrugCombAccuracyCOTORM/mean": 0.6179166436195374, "rewards/DrugCombAccuracyCOTORM/std": 0.47495320439338684, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.596284806728363, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 455.25, "completions/min_length": 406.0, "epoch": 1.4279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2819710969924927, "kl": 0.0028608939610421658, "learning_rate": 7.139705882352941e-07, "loss": 2.8681337425950915e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 486.625, "completions/min_length": 441.0, "epoch": 1.4294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8151471018791199, "kl": 0.00255053723230958, "learning_rate": 7.147058823529411e-07, "loss": 2.5521414499962702e-05, "reward": 0.6866666674613953, "reward_std": 0.08944202214479446, "rewards/DrugCombAccuracyCOTORM/mean": 0.6291666626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.40941423177719116, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.27216553688049316, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 443.25, "completions/min_length": 391.0, "epoch": 1.4308823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.7515332698822021, "kl": 0.0032918985234573483, "learning_rate": 7.154411764705882e-07, "loss": 3.273040056228638e-05, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 436.5, "completions/min_length": 371.0, "epoch": 1.4323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008903546258807182, "kl": 0.0024927295453380793, "learning_rate": 7.161764705882353e-07, "loss": 2.459523966535926e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 502.625, "completions/min_length": 446.0, "epoch": 1.4338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011432742699980736, "kl": 0.002641941129695624, "learning_rate": 7.169117647058823e-07, "loss": 2.6354209694545716e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 519.5625, "completions/min_length": 451.0, "epoch": 1.4352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.8179839849472046, "kl": 0.003169218427501619, "learning_rate": 7.176470588235294e-07, "loss": 3.2141804695129395e-05, "reward": 0.5910624861717224, "reward_std": 0.4072108268737793, "rewards/DrugCombAccuracyCOTORM/mean": 0.5304948091506958, "rewards/DrugCombAccuracyCOTORM/std": 0.4796990156173706, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6694387197494507, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 464.25, "completions/min_length": 405.0, "epoch": 1.436764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1795439720153809, "kl": 0.0026300924364477396, "learning_rate": 7.183823529411764e-07, "loss": 2.6638299459591508e-05, "reward": 0.81291663646698, "reward_std": 0.2009839117527008, "rewards/DrugCombAccuracyCOTORM/mean": 0.784375011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3865051865577698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.27131369709968567, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 444.5, "completions/min_length": 371.0, "epoch": 1.438235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 9.01486587524414, "kl": 0.10754193895263597, "learning_rate": 7.191176470588235e-07, "loss": 0.0010877400636672974, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 507.625, "completions/min_length": 430.0, "epoch": 1.4397058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.396889090538025, "kl": 0.00247964751906693, "learning_rate": 7.198529411764706e-07, "loss": 2.466142177581787e-05, "reward": 0.5447916984558105, "reward_std": 0.3523290157318115, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.4669642150402069, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 480.3125, "completions/min_length": 425.0, "epoch": 1.4411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011193130165338516, "kl": 0.0028678998060058802, "learning_rate": 7.205882352941176e-07, "loss": 2.8973394364584237e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 482.5625, "completions/min_length": 393.0, "epoch": 1.4426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2429492473602295, "kl": 0.004126646206714213, "learning_rate": 7.213235294117646e-07, "loss": 4.1015446186065674e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 412.375, "completions/min_length": 307.0, "epoch": 1.4441176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01938191056251526, "kl": 0.003252462425734848, "learning_rate": 7.220588235294118e-07, "loss": 3.24249776895158e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 467.1875, "completions/min_length": 374.0, "epoch": 1.4455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1421359777450562, "kl": 0.003382260329090059, "learning_rate": 7.227941176470588e-07, "loss": 3.3058226108551025e-05, "reward": 0.8999999761581421, "reward_std": 0.10690448433160782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 471.875, "completions/min_length": 414.0, "epoch": 1.4470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010225712321698666, "kl": 0.0026194595266133547, "learning_rate": 7.235294117647058e-07, "loss": 2.6212865122943185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 503.5, "completions/min_length": 386.0, "epoch": 1.4485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.2423514127731323, "kl": 0.0027098491555079818, "learning_rate": 7.242647058823529e-07, "loss": 2.7388334274291992e-05, "reward": 0.7749999761581421, "reward_std": 0.0707106813788414, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.3145764470100403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 445.6875, "completions/min_length": 360.0, "epoch": 1.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059827533550560474, "kl": 0.002168686012737453, "learning_rate": 7.249999999999999e-07, "loss": 2.15114723687293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 452.0, "completions/min_length": 345.0, "epoch": 1.451470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011818218044936657, "kl": 0.0024921307631302625, "learning_rate": 7.25735294117647e-07, "loss": 2.4945697077782825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 552.6875, "completions/min_length": 496.0, "epoch": 1.4529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5895230770111084, "kl": 0.0031627558637410402, "learning_rate": 7.264705882352941e-07, "loss": 3.167241811752319e-05, "reward": 0.5062500238418579, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 457.0625, "completions/min_length": 408.0, "epoch": 1.4544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.00640786113217473, "kl": 0.002319055434782058, "learning_rate": 7.272058823529411e-07, "loss": 2.3178778064902872e-05, "reward": 0.2409999966621399, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.08250000327825546, "rewards/DrugCombAccuracyCOTORM/std": 0.08520564436912537, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 510.75, "completions/min_length": 448.0, "epoch": 1.4558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.005098789930343628, "kl": 0.0017449258884880692, "learning_rate": 7.279411764705881e-07, "loss": 1.7406646293238737e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 448.9375, "completions/min_length": 384.0, "epoch": 1.4573529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010094820521771908, "kl": 0.002729828469455242, "learning_rate": 7.286764705882353e-07, "loss": 2.7313450118526816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 475.9375, "completions/min_length": 395.0, "epoch": 1.4588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0321310758590698, "kl": 0.0023980025143828243, "learning_rate": 7.294117647058823e-07, "loss": 2.4139881134033203e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 442.5625, "completions/min_length": 399.0, "epoch": 1.4602941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.9635889530181885, "kl": 0.0029633139492943883, "learning_rate": 7.301470588235293e-07, "loss": 2.9661345251952298e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 437.75, "completions/min_length": 377.0, "epoch": 1.461764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00773506797850132, "kl": 0.0027940290747210383, "learning_rate": 7.308823529411764e-07, "loss": 2.79288251476828e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 480.625, "completions/min_length": 413.0, "epoch": 1.4632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.009070456959307194, "kl": 0.002307457529241219, "learning_rate": 7.316176470588234e-07, "loss": 2.287012102897279e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 455.0625, "completions/min_length": 389.0, "epoch": 1.4647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.00575670599937439, "kl": 0.0022335471003316343, "learning_rate": 7.323529411764705e-07, "loss": 2.2228156012715772e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 454.125, "completions/min_length": 386.0, "epoch": 1.4661764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064154197461903095, "kl": 0.0024519407597836107, "learning_rate": 7.330882352941176e-07, "loss": 2.4597873562015593e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 438.0625, "completions/min_length": 377.0, "epoch": 1.4676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0312211513519287, "kl": 0.0020399057248141617, "learning_rate": 7.338235294117646e-07, "loss": 2.0615332687157206e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 506.8125, "completions/min_length": 418.0, "epoch": 1.4691176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3894129991531372, "kl": 0.00319976604077965, "learning_rate": 7.345588235294117e-07, "loss": 3.18475067615509e-05, "reward": 0.643750011920929, "reward_std": 0.40420976281166077, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 455.5, "completions/min_length": 410.0, "epoch": 1.4705882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.1396727561950684, "kl": 0.002432687790133059, "learning_rate": 7.352941176470589e-07, "loss": 2.4400407710345462e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 466.875, "completions/min_length": 415.0, "epoch": 1.4720588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 1.059206247329712, "kl": 0.002789029502309859, "learning_rate": 7.360294117647059e-07, "loss": 2.7898699045181274e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 430.3125, "completions/min_length": 367.0, "epoch": 1.473529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.006638956256210804, "kl": 0.0027176575385965407, "learning_rate": 7.367647058823529e-07, "loss": 2.754448723862879e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 502.4375, "completions/min_length": 446.0, "epoch": 1.475, "frac_reward_zero_std": 0.0, "grad_norm": 1.4278309345245361, "kl": 0.002905646455474198, "learning_rate": 7.375e-07, "loss": 2.8893351554870605e-05, "reward": 0.7553333044052124, "reward_std": 0.3920404314994812, "rewards/DrugCombAccuracyCOTORM/mean": 0.7150000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4409988820552826, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 514.125, "completions/min_length": 438.0, "epoch": 1.4764705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.1936047077178955, "kl": 0.0023483470431528986, "learning_rate": 7.382352941176471e-07, "loss": 2.339482307434082e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 419.625, "completions/min_length": 346.0, "epoch": 1.4779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2548458576202393, "kl": 0.002467646758304909, "learning_rate": 7.389705882352941e-07, "loss": 2.434849739074707e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/mean_length": 528.4375, "completions/min_length": 460.0, "epoch": 1.4794117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.2545585632324219, "kl": 0.002694028429687023, "learning_rate": 7.397058823529412e-07, "loss": 2.701207995414734e-05, "reward": 0.5773025751113892, "reward_std": 0.27198585867881775, "rewards/DrugCombAccuracyCOTORM/mean": 0.5278782248497009, "rewards/DrugCombAccuracyCOTORM/std": 0.393991082906723, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.550000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.4000000059604645, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 433.5625, "completions/min_length": 376.0, "epoch": 1.4808823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.006158790551126003, "kl": 0.0025189282605424523, "learning_rate": 7.404411764705882e-07, "loss": 2.5111632567131892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 495.875, "completions/min_length": 432.0, "epoch": 1.4823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.146525502204895, "kl": 0.0026475029881112278, "learning_rate": 7.411764705882352e-07, "loss": 2.6263296604156494e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 417.75, "completions/min_length": 347.0, "epoch": 1.4838235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.006574097089469433, "kl": 0.002272335725137964, "learning_rate": 7.419117647058824e-07, "loss": 2.2787869966123253e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 497.8125, "completions/min_length": 433.0, "epoch": 1.4852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.005792764015495777, "kl": 0.002152606757590547, "learning_rate": 7.426470588235294e-07, "loss": 2.1520896552829072e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 444.5625, "completions/min_length": 410.0, "epoch": 1.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006940156687051058, "kl": 0.0023971151967998594, "learning_rate": 7.433823529411764e-07, "loss": 2.3908178263809532e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/mean_length": 580.0, "completions/min_length": 453.0, "epoch": 1.488235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0392144918441772, "kl": 0.0028744310839101672, "learning_rate": 7.441176470588235e-07, "loss": 2.904979555751197e-05, "reward": 0.5719742178916931, "reward_std": 0.019404420629143715, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7197420597076416, "rewards/DrugCombCoverageCOTORM/std": 0.3925132751464844, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 465.5625, "completions/min_length": 344.0, "epoch": 1.4897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7925833463668823, "kl": 0.0035441901709418744, "learning_rate": 7.448529411764706e-07, "loss": 3.5803204809781164e-05, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 426.5625, "completions/min_length": 404.0, "epoch": 1.4911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9103580117225647, "kl": 0.002823310496751219, "learning_rate": 7.455882352941176e-07, "loss": 2.8341117285890505e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 526.625, "completions/min_length": 441.0, "epoch": 1.4926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0849205255508423, "kl": 0.0022280239209067076, "learning_rate": 7.463235294117647e-07, "loss": 2.2508203983306885e-05, "reward": 0.9021778106689453, "reward_std": 0.0754433423280716, "rewards/DrugCombAccuracyCOTORM/mean": 0.9054999947547913, "rewards/DrugCombAccuracyCOTORM/std": 0.14589037001132965, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7777777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.4969039857387543, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 456.8125, "completions/min_length": 390.0, "epoch": 1.4941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4715166091918945, "kl": 0.0032068174914456904, "learning_rate": 7.470588235294117e-07, "loss": 3.205612301826477e-05, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 433.5, "completions/min_length": 373.0, "epoch": 1.4955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.005902322940528393, "kl": 0.00216710934182629, "learning_rate": 7.477941176470587e-07, "loss": 2.1697516785934567e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 428.375, "completions/min_length": 395.0, "epoch": 1.4970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.216336727142334, "kl": 0.004398879362270236, "learning_rate": 7.485294117647059e-07, "loss": 4.3414533138275146e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 475.5, "completions/min_length": 431.0, "epoch": 1.4985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.004806699231266975, "kl": 0.002058016514638439, "learning_rate": 7.492647058823529e-07, "loss": 2.0530098481685854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 458.5, "completions/min_length": 370.0, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.5912950038909912, "kl": 0.0038464973331429064, "learning_rate": 7.5e-07, "loss": 3.851950168609619e-05, "reward": 0.38749998807907104, "reward_std": 0.2668103575706482, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 415.5625, "completions/min_length": 319.0, "epoch": 1.5014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9695069193840027, "kl": 0.0025242171832360327, "learning_rate": 7.50735294117647e-07, "loss": 2.5429651941522025e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 415.9375, "completions/min_length": 387.0, "epoch": 1.5029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054810428991913795, "kl": 0.002186810423154384, "learning_rate": 7.514705882352941e-07, "loss": 2.1894240489928052e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 450.8125, "completions/min_length": 399.0, "epoch": 1.5044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.006741054821759462, "kl": 0.0020038969232700765, "learning_rate": 7.522058823529412e-07, "loss": 2.0167255570413545e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 415.25, "completions/min_length": 343.0, "epoch": 1.5058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.007298414595425129, "kl": 0.002831577730830759, "learning_rate": 7.529411764705882e-07, "loss": 2.818186658259947e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 433.625, "completions/min_length": 375.0, "epoch": 1.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014928018674254417, "kl": 0.0025952577125281096, "learning_rate": 7.536764705882352e-07, "loss": 2.5752093279152177e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 443.3125, "completions/min_length": 396.0, "epoch": 1.5088235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3627806901931763, "kl": 0.0031800440046936274, "learning_rate": 7.544117647058824e-07, "loss": 3.173947334289551e-05, "reward": 0.4234375059604645, "reward_std": 0.26981136202812195, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.9309493899345398, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 464.0625, "completions/min_length": 385.0, "epoch": 1.5102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013576578348875046, "kl": 0.003006646176800132, "learning_rate": 7.551470588235294e-07, "loss": 3.020509575435426e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 477.875, "completions/min_length": 425.0, "epoch": 1.511764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1676286458969116, "kl": 0.002719848678680137, "learning_rate": 7.558823529411764e-07, "loss": 2.7329015210852958e-05, "reward": 0.6824166774749756, "reward_std": 0.13465391099452972, "rewards/DrugCombAccuracyCOTORM/mean": 0.6292856931686401, "rewards/DrugCombAccuracyCOTORM/std": 0.4378258287906647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7898809909820557, "rewards/DrugCombCoverageCOTORM/std": 0.27306318283081055, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 459.375, "completions/min_length": 375.0, "epoch": 1.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0038142204284668, "kl": 0.0025383176398463547, "learning_rate": 7.566176470588235e-07, "loss": 2.523418515920639e-05, "reward": 0.637499988079071, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 498.3125, "completions/min_length": 441.0, "epoch": 1.5147058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.9825279712677002, "kl": 0.0031446890789084136, "learning_rate": 7.573529411764705e-07, "loss": 3.14563512802124e-05, "reward": 0.8791666626930237, "reward_std": 0.2491552233695984, "rewards/DrugCombAccuracyCOTORM/mean": 0.8645833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.2803354561328888, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.28867512941360474, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 437.0, "completions/min_length": 389.0, "epoch": 1.5161764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.8998625874519348, "kl": 0.002720544289331883, "learning_rate": 7.580882352941176e-07, "loss": 2.719866097322665e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 471.125, "completions/min_length": 378.0, "epoch": 1.5176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.006986006163060665, "kl": 0.002695404691621661, "learning_rate": 7.588235294117647e-07, "loss": 2.6708448785939254e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 415.0, "completions/min_length": 345.0, "epoch": 1.5191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012958290986716747, "kl": 0.0029739229939877987, "learning_rate": 7.595588235294117e-07, "loss": 2.9022285161772743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 420.1875, "completions/min_length": 363.0, "epoch": 1.5205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.014332841150462627, "kl": 0.0026266255299560726, "learning_rate": 7.602941176470587e-07, "loss": 2.6337922463426366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 520.125, "completions/min_length": 455.0, "epoch": 1.5220588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.5465949773788452, "kl": 0.002908638503868133, "learning_rate": 7.610294117647059e-07, "loss": 2.915412187576294e-05, "reward": 0.31939637660980225, "reward_std": 0.27936050295829773, "rewards/DrugCombAccuracyCOTORM/mean": 0.17372465133666992, "rewards/DrugCombAccuracyCOTORM/std": 0.3318268358707428, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8041666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.2390955090522766, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 414.4375, "completions/min_length": 347.0, "epoch": 1.5235294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8959080576896667, "kl": 0.0019826739735435694, "learning_rate": 7.617647058823529e-07, "loss": 1.992591751331929e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 449.0, "completions/min_length": 409.0, "epoch": 1.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.005981252063065767, "kl": 0.002479981747455895, "learning_rate": 7.624999999999999e-07, "loss": 2.4594797650934197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 458.3125, "completions/min_length": 407.0, "epoch": 1.526470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3172212839126587, "kl": 0.0031218923977576196, "learning_rate": 7.63235294117647e-07, "loss": 3.1335774110630155e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 443.75, "completions/min_length": 405.0, "epoch": 1.5279411764705881, "frac_reward_zero_std": 0.5, "grad_norm": 0.8612820506095886, "kl": 0.002469711209414527, "learning_rate": 7.63970588235294e-07, "loss": 2.4667911930009723e-05, "reward": 0.6088333129882812, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4849589467048645, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 435.125, "completions/min_length": 408.0, "epoch": 1.5294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.018912864848971367, "kl": 0.0025033456622622907, "learning_rate": 7.647058823529411e-07, "loss": 2.5254312276956625e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 433.8125, "completions/min_length": 374.0, "epoch": 1.5308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067367288284003735, "kl": 0.0024663943331688643, "learning_rate": 7.654411764705882e-07, "loss": 2.451405998726841e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 443.125, "completions/min_length": 387.0, "epoch": 1.5323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0975247621536255, "kl": 0.002099747711326927, "learning_rate": 7.661764705882353e-07, "loss": 2.08243727684021e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 417.6875, "completions/min_length": 370.0, "epoch": 1.5338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.005145425442606211, "kl": 0.0022657347726635635, "learning_rate": 7.669117647058823e-07, "loss": 2.2773969249101356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 403.25, "completions/min_length": 360.0, "epoch": 1.5352941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.006459196098148823, "kl": 0.002175323781557381, "learning_rate": 7.676470588235295e-07, "loss": 2.1642681531375274e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 537.4375, "completions/min_length": 468.0, "epoch": 1.5367647058823528, "frac_reward_zero_std": 0.0, "grad_norm": 1.5695706605911255, "kl": 0.0030864812433719635, "learning_rate": 7.683823529411765e-07, "loss": 3.0666589736938477e-05, "reward": 0.8007215857505798, "reward_std": 0.34068500995635986, "rewards/DrugCombAccuracyCOTORM/mean": 0.7571519613265991, "rewards/DrugCombAccuracyCOTORM/std": 0.4025583267211914, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9500000476837158, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 463.6875, "completions/min_length": 376.0, "epoch": 1.538235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 17724.6875, "kl": 400.2125509819016, "learning_rate": 7.691176470588235e-07, "loss": 4.292599678039551, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 489.5, "completions/min_length": 424.0, "epoch": 1.539705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.089092493057251, "kl": 0.0024080215371213853, "learning_rate": 7.698529411764706e-07, "loss": 2.400505036348477e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 414.125, "completions/min_length": 377.0, "epoch": 1.5411764705882351, "frac_reward_zero_std": 1.0, "grad_norm": 0.016301365569233894, "kl": 0.002592123782960698, "learning_rate": 7.705882352941177e-07, "loss": 2.5563367671566084e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 432.0, "completions/min_length": 348.0, "epoch": 1.5426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.006240446120500565, "kl": 0.002146699669538066, "learning_rate": 7.713235294117647e-07, "loss": 2.1465542886289768e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 435.75, "completions/min_length": 392.0, "epoch": 1.5441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.014854279346764088, "kl": 0.0032699561561457813, "learning_rate": 7.720588235294118e-07, "loss": 3.1972758733900264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 513.3125, "completions/min_length": 464.0, "epoch": 1.5455882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.1178078651428223, "kl": 0.0037603845121338964, "learning_rate": 7.727941176470588e-07, "loss": 3.74397641280666e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 472.0625, "completions/min_length": 394.0, "epoch": 1.5470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.159602403640747, "kl": 0.002254900988191366, "learning_rate": 7.735294117647058e-07, "loss": 2.2485852241516113e-05, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 464.9375, "completions/min_length": 425.0, "epoch": 1.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.006620368920266628, "kl": 0.002363564388360828, "learning_rate": 7.74264705882353e-07, "loss": 2.3725864593870938e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 448.9375, "completions/min_length": 378.0, "epoch": 1.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.04092270880937576, "kl": 0.004177177412202582, "learning_rate": 7.75e-07, "loss": 4.123144753975794e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 437.625, "completions/min_length": 389.0, "epoch": 1.5514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.8475285768508911, "kl": 0.004555487306788564, "learning_rate": 7.75735294117647e-07, "loss": 4.5999884605407715e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 443.625, "completions/min_length": 391.0, "epoch": 1.5529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2180759906768799, "kl": 0.003420511318836361, "learning_rate": 7.764705882352941e-07, "loss": 3.499697049846873e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 467.0, "completions/min_length": 388.0, "epoch": 1.5544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.2089245319366455, "kl": 0.005170932621695101, "learning_rate": 7.772058823529412e-07, "loss": 5.296948802424595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 444.25, "completions/min_length": 370.0, "epoch": 1.5558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.004980865400284529, "kl": 0.0024388859746977687, "learning_rate": 7.779411764705882e-07, "loss": 2.432483233860694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 461.75, "completions/min_length": 389.0, "epoch": 1.5573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009137419052422047, "kl": 0.0030093201203271747, "learning_rate": 7.786764705882353e-07, "loss": 3.0124676413834095e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 432.4375, "completions/min_length": 391.0, "epoch": 1.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.005054599605500698, "kl": 0.00234615191584453, "learning_rate": 7.794117647058823e-07, "loss": 2.343934465898201e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 511.3125, "completions/min_length": 435.0, "epoch": 1.5602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1032155752182007, "kl": 0.0030976796988397837, "learning_rate": 7.801470588235293e-07, "loss": 3.089585152338259e-05, "reward": 0.3500000238418579, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 499.0, "completions/min_length": 417.0, "epoch": 1.561764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 2.010171890258789, "kl": 0.003402252506930381, "learning_rate": 7.808823529411765e-07, "loss": 3.409385681152344e-05, "reward": 0.28183335065841675, "reward_std": 0.27514076232910156, "rewards/DrugCombAccuracyCOTORM/mean": 0.18041667342185974, "rewards/DrugCombAccuracyCOTORM/std": 0.3625871539115906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.5426273345947266, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 466.25, "completions/min_length": 414.0, "epoch": 1.563235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008417599834501743, "kl": 0.003177437523845583, "learning_rate": 7.816176470588235e-07, "loss": 3.1691768526798114e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 479.125, "completions/min_length": 328.0, "epoch": 1.5647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9849303364753723, "kl": 0.002565448288805783, "learning_rate": 7.823529411764705e-07, "loss": 2.5532233848934993e-05, "reward": 0.9333333373069763, "reward_std": 0.07126964628696442, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 484.0, "completions/min_length": 424.0, "epoch": 1.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007499489933252335, "kl": 0.002645245665917173, "learning_rate": 7.830882352941176e-07, "loss": 2.634569136716891e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 433.3125, "completions/min_length": 368.0, "epoch": 1.5676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1387286186218262, "kl": 0.0032589257461950183, "learning_rate": 7.838235294117647e-07, "loss": 3.2573938369750977e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 453.3125, "completions/min_length": 416.0, "epoch": 1.5691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.007930893450975418, "kl": 0.002631100360304117, "learning_rate": 7.845588235294117e-07, "loss": 2.6322957637603395e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 526.9375, "completions/min_length": 466.0, "epoch": 1.5705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010262370109558105, "kl": 0.00262748368550092, "learning_rate": 7.852941176470588e-07, "loss": 2.6436278858454898e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 483.6875, "completions/min_length": 386.0, "epoch": 1.5720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0032130479812622, "kl": 0.002205566532211378, "learning_rate": 7.860294117647058e-07, "loss": 2.2326348698697984e-05, "reward": 0.9096875190734863, "reward_std": 0.13398922979831696, "rewards/DrugCombAccuracyCOTORM/mean": 0.8968750238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.23302032053470612, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.19116783142089844, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 470.8125, "completions/min_length": 387.0, "epoch": 1.5735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02520173043012619, "kl": 0.0035288897925056517, "learning_rate": 7.867647058823529e-07, "loss": 3.394000668777153e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 447.5, "completions/min_length": 401.0, "epoch": 1.575, "frac_reward_zero_std": 1.0, "grad_norm": 0.009344683960080147, "kl": 0.0026610749191604555, "learning_rate": 7.875e-07, "loss": 2.6706005883170292e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 433.5, "completions/min_length": 387.0, "epoch": 1.576470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.005946914199739695, "kl": 0.002258930413518101, "learning_rate": 7.88235294117647e-07, "loss": 2.2543852537637576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 476.25, "completions/min_length": 409.0, "epoch": 1.5779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012415158562362194, "kl": 0.003220881859306246, "learning_rate": 7.88970588235294e-07, "loss": 3.2160591217689216e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 452.0625, "completions/min_length": 404.0, "epoch": 1.5794117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.00987507589161396, "kl": 0.002865398768335581, "learning_rate": 7.897058823529411e-07, "loss": 2.8536147510749288e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 440.875, "completions/min_length": 383.0, "epoch": 1.5808823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.006750618573278189, "kl": 0.002429762447718531, "learning_rate": 7.904411764705882e-07, "loss": 2.406955354672391e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 453.5, "completions/min_length": 385.0, "epoch": 1.5823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.007457830477505922, "kl": 0.0027492533554323018, "learning_rate": 7.911764705882352e-07, "loss": 2.738140756264329e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 432.6875, "completions/min_length": 336.0, "epoch": 1.5838235294117649, "frac_reward_zero_std": 0.5, "grad_norm": 1.0431787967681885, "kl": 0.0029117366357240826, "learning_rate": 7.919117647058823e-07, "loss": 2.91008018393768e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 428.25, "completions/min_length": 373.0, "epoch": 1.585294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01954193040728569, "kl": 0.003015766851603985, "learning_rate": 7.926470588235293e-07, "loss": 2.9880333386245184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 383.5625, "completions/min_length": 339.0, "epoch": 1.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9216882586479187, "kl": 0.0033840991090983152, "learning_rate": 7.933823529411764e-07, "loss": 3.377497341716662e-05, "reward": 0.5839166641235352, "reward_std": 0.17849332094192505, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9867174029350281, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 444.875, "completions/min_length": 382.0, "epoch": 1.5882352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 1.0394365787506104, "kl": 0.0030057529220357537, "learning_rate": 7.941176470588235e-07, "loss": 2.9675662517547607e-05, "reward": 0.731249988079071, "reward_std": 0.22350695729255676, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 435.0, "completions/min_length": 389.0, "epoch": 1.5897058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.006352863274514675, "kl": 0.002243869414087385, "learning_rate": 7.948529411764705e-07, "loss": 2.2208852897165343e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 461.5625, "completions/min_length": 405.0, "epoch": 1.5911764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.855718970298767, "kl": 0.003352325817104429, "learning_rate": 7.955882352941175e-07, "loss": 3.345310688018799e-05, "reward": 0.5562499761581421, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 505.1875, "completions/min_length": 431.0, "epoch": 1.5926470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5663189888000488, "kl": 0.0034012534888461232, "learning_rate": 7.963235294117646e-07, "loss": 3.395974636077881e-05, "reward": 0.7250000238418579, "reward_std": 0.38195645809173584, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 475.6875, "completions/min_length": 421.0, "epoch": 1.5941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9319671988487244, "kl": 0.004667088738642633, "learning_rate": 7.970588235294118e-07, "loss": 4.567781797959469e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 469.0, "completions/min_length": 391.0, "epoch": 1.5955882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.9647791385650635, "kl": 0.00316156807821244, "learning_rate": 7.977941176470588e-07, "loss": 3.1497540476266295e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 474.5625, "completions/min_length": 398.0, "epoch": 1.5970588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 0.9730669856071472, "kl": 0.003219445061404258, "learning_rate": 7.985294117647059e-07, "loss": 3.2357871532440186e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 389.625, "completions/min_length": 364.0, "epoch": 1.598529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.017939826473593712, "kl": 0.002977569994982332, "learning_rate": 7.992647058823529e-07, "loss": 2.968457374663558e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 485.4375, "completions/min_length": 391.0, "epoch": 1.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.9468680024147034, "kl": 0.0033129682415165007, "learning_rate": 8e-07, "loss": 3.306195139884949e-05, "reward": 0.7767791748046875, "reward_std": 0.16047655045986176, "rewards/DrugCombAccuracyCOTORM/mean": 0.7437604069709778, "rewards/DrugCombAccuracyCOTORM/std": 0.35560956597328186, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8177083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5002025365829468, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 477.5625, "completions/min_length": 407.0, "epoch": 1.6014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9671849608421326, "kl": 0.0026946563739329576, "learning_rate": 8.007352941176471e-07, "loss": 2.6967376470565796e-05, "reward": 0.940625011920929, "reward_std": 0.0819452702999115, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 415.25, "completions/min_length": 355.0, "epoch": 1.6029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012159550562500954, "kl": 0.002441892691422254, "learning_rate": 8.014705882352941e-07, "loss": 2.4306536943186074e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 471.625, "completions/min_length": 387.0, "epoch": 1.6044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.238144874572754, "kl": 0.0029925231356173754, "learning_rate": 8.022058823529411e-07, "loss": 2.994356327690184e-05, "reward": 0.5693333148956299, "reward_std": 0.06306761503219604, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.7934920787811279, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 533.0, "completions/min_length": 478.0, "epoch": 1.6058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.844713807106018, "kl": 0.0038058391655795276, "learning_rate": 8.029411764705883e-07, "loss": 3.8251280784606934e-05, "reward": 0.6797083616256714, "reward_std": 0.33823251724243164, "rewards/DrugCombAccuracyCOTORM/mean": 0.6686458587646484, "rewards/DrugCombAccuracyCOTORM/std": 0.4308859407901764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8750000596046448, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 484.375, "completions/min_length": 427.0, "epoch": 1.6073529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.7253530025482178, "kl": 0.00416848057648167, "learning_rate": 8.036764705882353e-07, "loss": 4.149973392486572e-05, "reward": 0.5541666746139526, "reward_std": 0.24192485213279724, "rewards/DrugCombAccuracyCOTORM/mean": 0.5208333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.48638883233070374, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 483.5625, "completions/min_length": 397.0, "epoch": 1.6088235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.016233598813414574, "kl": 0.0033581023453734815, "learning_rate": 8.044117647058823e-07, "loss": 3.307005681563169e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 480.8125, "completions/min_length": 405.0, "epoch": 1.6102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.040862798690796, "kl": 0.003020649019163102, "learning_rate": 8.051470588235294e-07, "loss": 2.9810704290866852e-05, "reward": 0.629687488079071, "reward_std": 0.22892777621746063, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9464847445487976, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 441.25, "completions/min_length": 404.0, "epoch": 1.611764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2798640727996826, "kl": 0.003250640584155917, "learning_rate": 8.058823529411764e-07, "loss": 3.243982791900635e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 463.3125, "completions/min_length": 380.0, "epoch": 1.613235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1643450260162354, "kl": 0.003142144123557955, "learning_rate": 8.066176470588235e-07, "loss": 3.116202788078226e-05, "reward": 0.6153125166893005, "reward_std": 0.08732572197914124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5562499761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.4802343249320984, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.703125, "rewards/DrugCombCoverageCOTORM/std": 0.33189794421195984, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 435.6875, "completions/min_length": 339.0, "epoch": 1.6147058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0378483533859253, "kl": 0.0024105324409902096, "learning_rate": 8.073529411764706e-07, "loss": 2.3850683646742254e-05, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 511.75, "completions/min_length": 436.0, "epoch": 1.6161764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2833237648010254, "kl": 0.0025249430327676237, "learning_rate": 8.080882352941176e-07, "loss": 2.5220215320587158e-05, "reward": 0.5773155093193054, "reward_std": 0.22326932847499847, "rewards/DrugCombAccuracyCOTORM/mean": 0.5094047784805298, "rewards/DrugCombAccuracyCOTORM/std": 0.29399728775024414, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6979166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.49523189663887024, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 469.4375, "completions/min_length": 392.0, "epoch": 1.6176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4756848812103271, "kl": 0.0029263601172715425, "learning_rate": 8.088235294117646e-07, "loss": 2.919137477874756e-05, "reward": 0.9026666879653931, "reward_std": 0.2753002643585205, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 457.75, "completions/min_length": 385.0, "epoch": 1.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.001667857170105, "kl": 0.003735692473128438, "learning_rate": 8.095588235294118e-07, "loss": 3.746151924133301e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 449.1875, "completions/min_length": 380.0, "epoch": 1.6205882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.017917951568961143, "kl": 0.004156217502895743, "learning_rate": 8.102941176470588e-07, "loss": 4.1301515011582524e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 438.6875, "completions/min_length": 375.0, "epoch": 1.6220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.006401123013347387, "kl": 0.002107574633555487, "learning_rate": 8.110294117647058e-07, "loss": 2.1184336219448596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 439.75, "completions/min_length": 395.0, "epoch": 1.6235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010208752937614918, "kl": 0.0031872314866632223, "learning_rate": 8.117647058823529e-07, "loss": 3.180462590535171e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 435.6875, "completions/min_length": 399.0, "epoch": 1.625, "frac_reward_zero_std": 0.5, "grad_norm": 0.8886079788208008, "kl": 0.00544597051339224, "learning_rate": 8.125e-07, "loss": 5.3464445954887196e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 450.1875, "completions/min_length": 393.0, "epoch": 1.6264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0325680710375309, "kl": 0.004086444969289005, "learning_rate": 8.13235294117647e-07, "loss": 4.0265604184241965e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 497.25, "completions/min_length": 361.0, "epoch": 1.6279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7996136546134949, "kl": 0.002042023668764159, "learning_rate": 8.139705882352941e-07, "loss": 2.047353336820379e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 483.8125, "completions/min_length": 425.0, "epoch": 1.6294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4551870822906494, "kl": 0.002469105878844857, "learning_rate": 8.147058823529411e-07, "loss": 2.4732202291488647e-05, "reward": 0.8208958506584167, "reward_std": 0.220864400267601, "rewards/DrugCombAccuracyCOTORM/mean": 0.7858854532241821, "rewards/DrugCombAccuracyCOTORM/std": 0.34341874718666077, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.14099103212356567, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 456.6875, "completions/min_length": 360.0, "epoch": 1.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.3377095460891724, "kl": 0.00349610616103746, "learning_rate": 8.154411764705881e-07, "loss": 3.43967585649807e-05, "reward": 0.7416666746139526, "reward_std": 0.20266088843345642, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4367387592792511, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 523.1875, "completions/min_length": 468.0, "epoch": 1.6323529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3173614740371704, "kl": 0.002486452751327306, "learning_rate": 8.161764705882353e-07, "loss": 2.51084566116333e-05, "reward": 0.9366071820259094, "reward_std": 0.16019666194915771, "rewards/DrugCombAccuracyCOTORM/mean": 0.9285714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.2501700222492218, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 447.625, "completions/min_length": 404.0, "epoch": 1.6338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9784667491912842, "kl": 0.0027331039891578257, "learning_rate": 8.169117647058823e-07, "loss": 2.712339482968673e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 412.625, "completions/min_length": 340.0, "epoch": 1.6352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.005226373206824064, "kl": 0.002440029929857701, "learning_rate": 8.176470588235293e-07, "loss": 2.4368669983232394e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 388.0625, "completions/min_length": 323.0, "epoch": 1.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.06962913274765015, "kl": 0.005404387833550572, "learning_rate": 8.183823529411764e-07, "loss": 5.492034324561246e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 478.3125, "completions/min_length": 402.0, "epoch": 1.638235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.019731836393475533, "kl": 0.004608541552443057, "learning_rate": 8.191176470588235e-07, "loss": 4.564027767628431e-05, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 473.8125, "completions/min_length": 373.0, "epoch": 1.6397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.005112020764499903, "kl": 0.002104107610648498, "learning_rate": 8.198529411764705e-07, "loss": 2.0934437998221256e-05, "reward": 0.9428571462631226, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.9285714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.07377111166715622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 453.4375, "completions/min_length": 404.0, "epoch": 1.6411764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0095375282689929, "kl": 0.002652492868946865, "learning_rate": 8.205882352941176e-07, "loss": 2.650166061357595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 476.875, "completions/min_length": 445.0, "epoch": 1.6426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.146573781967163, "kl": 0.003917861613444984, "learning_rate": 8.213235294117646e-07, "loss": 3.945082426071167e-05, "reward": 0.8302083611488342, "reward_std": 0.07013839483261108, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.24720662832260132, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 432.5, "completions/min_length": 363.0, "epoch": 1.6441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.035767912864685, "kl": 0.004055011842865497, "learning_rate": 8.220588235294116e-07, "loss": 4.09930944442749e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 469.5, "completions/min_length": 401.0, "epoch": 1.6455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.08377480506896973, "kl": 0.003393461462110281, "learning_rate": 8.227941176470588e-07, "loss": 3.4248656447744e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/mean_length": 528.875, "completions/min_length": 383.0, "epoch": 1.6470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8414080142974854, "kl": 0.002922877436503768, "learning_rate": 8.235294117647058e-07, "loss": 2.9219612770248204e-05, "reward": 0.9361000061035156, "reward_std": 0.08819037675857544, "rewards/DrugCombAccuracyCOTORM/mean": 0.9248124957084656, "rewards/DrugCombAccuracyCOTORM/std": 0.16164828836917877, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.08062257617712021, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 429.625, "completions/min_length": 334.0, "epoch": 1.6485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1541478633880615, "kl": 0.002790730562992394, "learning_rate": 8.242647058823528e-07, "loss": 2.7572084945859388e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 446.5625, "completions/min_length": 391.0, "epoch": 1.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.04456803575158119, "kl": 0.0036997452843934298, "learning_rate": 8.249999999999999e-07, "loss": 3.735832069651224e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 482.625, "completions/min_length": 423.0, "epoch": 1.651470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7710276246070862, "kl": 0.003443434543441981, "learning_rate": 8.25735294117647e-07, "loss": 3.4011900424957275e-05, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 427.5625, "completions/min_length": 390.0, "epoch": 1.6529411764705881, "frac_reward_zero_std": 0.5, "grad_norm": 1.10500967502594, "kl": 0.0025745134335011244, "learning_rate": 8.264705882352941e-07, "loss": 2.565922295616474e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 465.0, "completions/min_length": 403.0, "epoch": 1.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015825456008315086, "kl": 0.0040918406448327005, "learning_rate": 8.272058823529411e-07, "loss": 4.079766949871555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 548.375, "completions/min_length": 465.0, "epoch": 1.6558823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 68039.15625, "kl": 1439.0893503407715, "learning_rate": 8.279411764705881e-07, "loss": 13.73794937133789, "reward": 0.6614999771118164, "reward_std": 0.24975880980491638, "rewards/DrugCombAccuracyCOTORM/mean": 0.5924999713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.4264713227748871, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.24720662832260132, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 498.75, "completions/min_length": 407.0, "epoch": 1.6573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9021608829498291, "kl": 0.0028663392295129597, "learning_rate": 8.286764705882354e-07, "loss": 2.9150802220101468e-05, "reward": 0.8835000395774841, "reward_std": 0.01932758092880249, "rewards/DrugCombAccuracyCOTORM/mean": 0.8647916316986084, "rewards/DrugCombAccuracyCOTORM/std": 0.14349070191383362, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 433.4375, "completions/min_length": 379.0, "epoch": 1.6588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.006076464429497719, "kl": 0.0022819569567218423, "learning_rate": 8.294117647058824e-07, "loss": 2.290246084157843e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 437.8125, "completions/min_length": 380.0, "epoch": 1.6602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.007259204052388668, "kl": 0.0026693876134231687, "learning_rate": 8.301470588235294e-07, "loss": 2.6786628950503655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 512.25, "completions/min_length": 448.0, "epoch": 1.6617647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.0950783491134644, "kl": 0.0034642070531845093, "learning_rate": 8.308823529411765e-07, "loss": 3.457479033386335e-05, "reward": 0.84333336353302, "reward_std": 0.023570239543914795, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 443.9375, "completions/min_length": 364.0, "epoch": 1.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006372744683176279, "kl": 0.002669003210030496, "learning_rate": 8.316176470588235e-07, "loss": 2.6538567908573896e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 454.5625, "completions/min_length": 395.0, "epoch": 1.664705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.3518348932266235, "kl": 0.0034815517719835043, "learning_rate": 8.323529411764706e-07, "loss": 3.445472248131409e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 448.875, "completions/min_length": 388.0, "epoch": 1.6661764705882351, "frac_reward_zero_std": 0.5, "grad_norm": 1.2152189016342163, "kl": 0.0033907567558344454, "learning_rate": 8.330882352941177e-07, "loss": 3.407895565032959e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 427.625, "completions/min_length": 400.0, "epoch": 1.6676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.015046849846839905, "kl": 0.002874185796827078, "learning_rate": 8.338235294117647e-07, "loss": 2.8477305022533983e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 451.1875, "completions/min_length": 425.0, "epoch": 1.6691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.008128528483211994, "kl": 0.0027164346538484097, "learning_rate": 8.345588235294117e-07, "loss": 2.7150410460308194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 464.1875, "completions/min_length": 416.0, "epoch": 1.6705882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.1497712135314941, "kl": 0.002714523463509977, "learning_rate": 8.352941176470589e-07, "loss": 2.721258351812139e-05, "reward": 0.7749999761581421, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.40824830532073975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.40824830532073975, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 506.0625, "completions/min_length": 444.0, "epoch": 1.6720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9662876725196838, "kl": 0.0028859179583378136, "learning_rate": 8.360294117647059e-07, "loss": 2.8708489480777644e-05, "reward": 0.15625, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 461.4375, "completions/min_length": 365.0, "epoch": 1.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8796541094779968, "kl": 0.003273594717029482, "learning_rate": 8.367647058823529e-07, "loss": 3.246963024139404e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 510.25, "completions/min_length": 453.0, "epoch": 1.675, "frac_reward_zero_std": 0.5, "grad_norm": 0.8160582780838013, "kl": 0.0023469082079827785, "learning_rate": 8.375e-07, "loss": 2.3467218852601945e-05, "reward": 0.8311166763305664, "reward_std": 0.1805875599384308, "rewards/DrugCombAccuracyCOTORM/mean": 0.8008750081062317, "rewards/DrugCombAccuracyCOTORM/std": 0.3562396168708801, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9041666984558105, "rewards/DrugCombCoverageCOTORM/std": 0.17207878828048706, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 481.9375, "completions/min_length": 445.0, "epoch": 1.6764705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.6813253164291382, "kl": 0.002648472087457776, "learning_rate": 8.38235294117647e-07, "loss": 2.6378780603408813e-05, "reward": 0.8714062571525574, "reward_std": 0.3039039969444275, "rewards/DrugCombAccuracyCOTORM/mean": 0.8578125238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.31907665729522705, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8515625, "rewards/DrugCombCoverageCOTORM/std": 0.5025326609611511, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 466.9375, "completions/min_length": 402.0, "epoch": 1.6779411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.509328007698059, "kl": 0.004537993751000613, "learning_rate": 8.389705882352941e-07, "loss": 4.552304744720459e-05, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 432.9375, "completions/min_length": 392.0, "epoch": 1.6794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0579485893249512, "kl": 0.00433750994852744, "learning_rate": 8.397058823529412e-07, "loss": 4.2125582695007324e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 478.6875, "completions/min_length": 439.0, "epoch": 1.6808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.3653461933135986, "kl": 0.0034054037532769144, "learning_rate": 8.404411764705882e-07, "loss": 3.4528609830886126e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 451.4375, "completions/min_length": 419.0, "epoch": 1.6823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007461516186594963, "kl": 0.0027671200805343688, "learning_rate": 8.411764705882352e-07, "loss": 2.7688236514222808e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 475.9375, "completions/min_length": 420.0, "epoch": 1.6838235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.7535749673843384, "kl": 0.003386695636436343, "learning_rate": 8.419117647058824e-07, "loss": 3.3721327781677246e-05, "reward": 0.7749999761581421, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 445.0625, "completions/min_length": 325.0, "epoch": 1.6852941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.803257942199707, "kl": 0.0025450088432990015, "learning_rate": 8.426470588235294e-07, "loss": 2.5469675165368244e-05, "reward": 0.5845000147819519, "reward_std": 0.0647699236869812, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 495.1875, "completions/min_length": 453.0, "epoch": 1.686764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.005470095202326775, "kl": 0.0023135016672313213, "learning_rate": 8.433823529411764e-07, "loss": 2.311548450961709e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 427.0, "completions/min_length": 398.0, "epoch": 1.688235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012724118307232857, "kl": 0.003386598138604313, "learning_rate": 8.441176470588235e-07, "loss": 3.3885709854075685e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 477.3125, "completions/min_length": 391.0, "epoch": 1.6897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006582160014659166, "kl": 0.0026023723767139018, "learning_rate": 8.448529411764706e-07, "loss": 2.5958379410440102e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 441.3125, "completions/min_length": 414.0, "epoch": 1.6911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006065784487873316, "kl": 0.002509134355932474, "learning_rate": 8.455882352941176e-07, "loss": 2.4983048206195235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 472.9375, "completions/min_length": 419.0, "epoch": 1.6926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9476182460784912, "kl": 0.00312062562443316, "learning_rate": 8.463235294117647e-07, "loss": 3.149257827317342e-05, "reward": 0.574999988079071, "reward_std": 0.17320507764816284, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.9309493899345398, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 469.0, "completions/min_length": 400.0, "epoch": 1.6941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8512783646583557, "kl": 0.0033139680163003504, "learning_rate": 8.470588235294117e-07, "loss": 3.32370400428772e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 479.8125, "completions/min_length": 387.0, "epoch": 1.6955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.867093026638031, "kl": 0.002729204250499606, "learning_rate": 8.477941176470587e-07, "loss": 2.720579504966736e-05, "reward": 0.7581833600997925, "reward_std": 0.20224449038505554, "rewards/DrugCombAccuracyCOTORM/mean": 0.7232500314712524, "rewards/DrugCombAccuracyCOTORM/std": 0.42540374398231506, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7958333492279053, "rewards/DrugCombCoverageCOTORM/std": 0.33040380477905273, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 472.125, "completions/min_length": 401.0, "epoch": 1.6970588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.6623953580856323, "kl": 0.004487751342821866, "learning_rate": 8.485294117647059e-07, "loss": 4.413723945617676e-05, "reward": 0.46158337593078613, "reward_std": 0.2871585488319397, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.38047048449516296, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3958333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.35420751571655273, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 465.375, "completions/min_length": 383.0, "epoch": 1.6985294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0037918090820312, "kl": 0.0028991374420002103, "learning_rate": 8.492647058823529e-07, "loss": 2.9020171496085823e-05, "reward": 0.8069062829017639, "reward_std": 0.022535257041454315, "rewards/DrugCombAccuracyCOTORM/mean": 0.7654687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.24536363780498505, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.06404344737529755, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 463.8125, "completions/min_length": 370.0, "epoch": 1.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.007423972710967064, "kl": 0.0028935965383425355, "learning_rate": 8.499999999999999e-07, "loss": 2.911312913056463e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 390.0, "completions/min_length": 345.0, "epoch": 1.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02775440737605095, "kl": 0.0035653269151225686, "learning_rate": 8.50735294117647e-07, "loss": 3.55176925950218e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 456.125, "completions/min_length": 410.0, "epoch": 1.7029411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5320310592651367, "kl": 0.005338495131582022, "learning_rate": 8.514705882352941e-07, "loss": 5.182623863220215e-05, "reward": 0.5177083611488342, "reward_std": 0.48121196031570435, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34375, "rewards/DrugCombCoverageCOTORM/std": 0.7899103164672852, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 416.25, "completions/min_length": 362.0, "epoch": 1.7044117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.01002279669046402, "kl": 0.0023820280330255628, "learning_rate": 8.522058823529411e-07, "loss": 2.3817308829165995e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 475.4375, "completions/min_length": 402.0, "epoch": 1.7058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.029427170753479, "kl": 0.0030170651734806597, "learning_rate": 8.529411764705882e-07, "loss": 3.009755164384842e-05, "reward": 0.6812499761581421, "reward_std": 0.19988836348056793, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 493.4375, "completions/min_length": 429.0, "epoch": 1.7073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7928440570831299, "kl": 0.0026065042475238442, "learning_rate": 8.536764705882352e-07, "loss": 2.600548032205552e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 493.3125, "completions/min_length": 438.0, "epoch": 1.7088235294117649, "frac_reward_zero_std": 0.5, "grad_norm": 1.0887264013290405, "kl": 0.002858805819414556, "learning_rate": 8.544117647058822e-07, "loss": 2.8736889362335205e-05, "reward": 0.7265416383743286, "reward_std": 0.23563504219055176, "rewards/DrugCombAccuracyCOTORM/mean": 0.7115625143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.44530782103538513, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5729166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7934191226959229, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 414.4375, "completions/min_length": 350.0, "epoch": 1.710294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01610865257680416, "kl": 0.0026323641650378704, "learning_rate": 8.551470588235294e-07, "loss": 2.6263716790708713e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 478.4375, "completions/min_length": 427.0, "epoch": 1.711764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.084288239479065, "kl": 0.0019948479312006384, "learning_rate": 8.558823529411764e-07, "loss": 1.9878149032592773e-05, "reward": 0.48741665482521057, "reward_std": 0.35673633217811584, "rewards/DrugCombAccuracyCOTORM/mean": 0.4087499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4215507209300995, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3890872597694397, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 462.5, "completions/min_length": 377.0, "epoch": 1.7132352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.009321687743067741, "kl": 0.002497137029422447, "learning_rate": 8.566176470588234e-07, "loss": 2.5010653189383447e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 466.75, "completions/min_length": 369.0, "epoch": 1.7147058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.9072438478469849, "kl": 0.003519760211929679, "learning_rate": 8.573529411764705e-07, "loss": 3.510713577270508e-05, "reward": 0.9588750004768372, "reward_std": 0.056757885962724686, "rewards/DrugCombAccuracyCOTORM/mean": 0.9524999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.10212194174528122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 419.0625, "completions/min_length": 355.0, "epoch": 1.7161764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9110025763511658, "kl": 0.003319225972518325, "learning_rate": 8.580882352941176e-07, "loss": 3.263354301452637e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 479.0625, "completions/min_length": 380.0, "epoch": 1.7176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5095438957214355, "kl": 0.003913494059816003, "learning_rate": 8.588235294117646e-07, "loss": 3.921985626220703e-05, "reward": 0.5720999836921692, "reward_std": 0.3447231650352478, "rewards/DrugCombAccuracyCOTORM/mean": 0.4869999885559082, "rewards/DrugCombAccuracyCOTORM/std": 0.47710126638412476, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.824999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.33366650342941284, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 473.375, "completions/min_length": 406.0, "epoch": 1.7191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008167256601154804, "kl": 0.0026070107123814523, "learning_rate": 8.595588235294118e-07, "loss": 2.5954825105145574e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 480.5, "completions/min_length": 415.0, "epoch": 1.7205882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.8270265460014343, "kl": 0.0029671091469936073, "learning_rate": 8.602941176470588e-07, "loss": 2.9556724257417955e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 428.75, "completions/min_length": 372.0, "epoch": 1.7220588235294119, "frac_reward_zero_std": 0.5, "grad_norm": 1.2721941471099854, "kl": 0.004903842316707596, "learning_rate": 8.610294117647059e-07, "loss": 4.873424768447876e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 423.625, "completions/min_length": 354.0, "epoch": 1.723529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009784266352653503, "kl": 0.00289773236727342, "learning_rate": 8.61764705882353e-07, "loss": 2.9062051908113062e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 461.75, "completions/min_length": 350.0, "epoch": 1.725, "frac_reward_zero_std": 0.0, "grad_norm": 1.537125587463379, "kl": 0.003070225357078016, "learning_rate": 8.625e-07, "loss": 3.065168857574463e-05, "reward": 0.5080000162124634, "reward_std": 0.23939456045627594, "rewards/DrugCombAccuracyCOTORM/mean": 0.41624999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.4741993248462677, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.9375, "completions/min_length": 346.0, "epoch": 1.7264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1841018199920654, "kl": 0.0038195779197849333, "learning_rate": 8.63235294117647e-07, "loss": 3.795325756072998e-05, "reward": 0.6102625131607056, "reward_std": 0.17959441244602203, "rewards/DrugCombAccuracyCOTORM/mean": 0.5944687724113464, "rewards/DrugCombAccuracyCOTORM/std": 0.4814327657222748, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34687501192092896, "rewards/DrugCombCoverageCOTORM/std": 0.9408274292945862, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 404.625, "completions/min_length": 376.0, "epoch": 1.7279411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.006443818565458059, "kl": 0.002491776889655739, "learning_rate": 8.639705882352941e-07, "loss": 2.4897915864130482e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 454.625, "completions/min_length": 365.0, "epoch": 1.7294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2779185771942139, "kl": 0.003262733342126012, "learning_rate": 8.647058823529412e-07, "loss": 3.2357871532440186e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 489.4375, "completions/min_length": 416.0, "epoch": 1.7308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1202030181884766, "kl": 0.0034111607237719, "learning_rate": 8.654411764705882e-07, "loss": 3.3656870073173195e-05, "reward": 0.38750001788139343, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 445.75, "completions/min_length": 343.0, "epoch": 1.7323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.4898468255996704, "kl": 0.002631027193274349, "learning_rate": 8.661764705882353e-07, "loss": 2.635270357131958e-05, "reward": 0.637499988079071, "reward_std": 0.3619407117366791, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 451.75, "completions/min_length": 389.0, "epoch": 1.7338235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.012458947487175465, "kl": 0.003116608306299895, "learning_rate": 8.669117647058823e-07, "loss": 3.123133865301497e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 447.1875, "completions/min_length": 408.0, "epoch": 1.7352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9640889763832092, "kl": 0.0031399158178828657, "learning_rate": 8.676470588235294e-07, "loss": 3.144350921502337e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 440.5, "completions/min_length": 365.0, "epoch": 1.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009494413621723652, "kl": 0.00295446248492226, "learning_rate": 8.683823529411765e-07, "loss": 2.9794357033097185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 431.1875, "completions/min_length": 393.0, "epoch": 1.738235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.003939324524253607, "kl": 0.0016631362377665937, "learning_rate": 8.691176470588235e-07, "loss": 1.654220795899164e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 454.9375, "completions/min_length": 402.0, "epoch": 1.7397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014582017436623573, "kl": 0.003663020266685635, "learning_rate": 8.698529411764705e-07, "loss": 3.687303978949785e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 445.125, "completions/min_length": 370.0, "epoch": 1.7411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012410324066877365, "kl": 0.004016881983261555, "learning_rate": 8.705882352941177e-07, "loss": 4.002909918199293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 542.875, "completions/min_length": 418.0, "epoch": 1.7426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6504271030426025, "kl": 0.0021270166034810245, "learning_rate": 8.713235294117647e-07, "loss": 2.1406287487479858e-05, "reward": 0.7708874940872192, "reward_std": 0.20805291831493378, "rewards/DrugCombAccuracyCOTORM/mean": 0.7249374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.44349005818367004, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.25166115164756775, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 460.5625, "completions/min_length": 409.0, "epoch": 1.7441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.260284423828125, "kl": 0.002507448458345607, "learning_rate": 8.720588235294117e-07, "loss": 2.541765570640564e-05, "reward": 0.5133333206176758, "reward_std": 0.40299826860427856, "rewards/DrugCombAccuracyCOTORM/mean": 0.4437500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4539732038974762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4303314983844757, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 459.0625, "completions/min_length": 381.0, "epoch": 1.7455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0792042016983032, "kl": 0.003734201251063496, "learning_rate": 8.727941176470588e-07, "loss": 3.715746061061509e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 427.8125, "completions/min_length": 400.0, "epoch": 1.7470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009183474816381931, "kl": 0.003193577693309635, "learning_rate": 8.735294117647058e-07, "loss": 3.225011096219532e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 443.875, "completions/min_length": 375.0, "epoch": 1.7485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8816631436347961, "kl": 0.002432824345305562, "learning_rate": 8.742647058823529e-07, "loss": 2.4743378162384033e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 472.0, "completions/min_length": 441.0, "epoch": 1.75, "frac_reward_zero_std": 0.5, "grad_norm": 1.0453906059265137, "kl": 0.0031222396064549685, "learning_rate": 8.75e-07, "loss": 3.1057272281032056e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 468.1875, "completions/min_length": 374.0, "epoch": 1.7514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0369834899902344, "kl": 0.004258816945366561, "learning_rate": 8.75735294117647e-07, "loss": 4.192542837699875e-05, "reward": 0.78125, "reward_std": 0.22350695729255676, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.4375, "completions/min_length": 412.0, "epoch": 1.7529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017437584698200226, "kl": 0.0033715636818669736, "learning_rate": 8.76470588235294e-07, "loss": 3.4038406738545746e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 457.5, "completions/min_length": 395.0, "epoch": 1.7544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.009689929895102978, "kl": 0.0029678225982934237, "learning_rate": 8.772058823529412e-07, "loss": 3.0036779207875952e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 444.5, "completions/min_length": 349.0, "epoch": 1.7558823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.6866650581359863, "kl": 0.003463706118054688, "learning_rate": 8.779411764705882e-07, "loss": 3.457069396972656e-05, "reward": 0.7749999761581421, "reward_std": 0.41661903262138367, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 455.5, "completions/min_length": 388.0, "epoch": 1.7573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066758752800524235, "kl": 0.0024803831765893847, "learning_rate": 8.786764705882352e-07, "loss": 2.4690378268132918e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 419.25, "completions/min_length": 341.0, "epoch": 1.7588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.940794825553894, "kl": 0.0038486983394250274, "learning_rate": 8.794117647058823e-07, "loss": 3.8383899664040655e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 464.875, "completions/min_length": 376.0, "epoch": 1.7602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0192674398422241, "kl": 0.0027792674372904003, "learning_rate": 8.801470588235293e-07, "loss": 2.7832677005790174e-05, "reward": 0.7552083730697632, "reward_std": 0.20202043652534485, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 450.6875, "completions/min_length": 398.0, "epoch": 1.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008062902837991714, "kl": 0.0030956692062318325, "learning_rate": 8.808823529411764e-07, "loss": 3.086279320996255e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 419.75, "completions/min_length": 361.0, "epoch": 1.763235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.005543859675526619, "kl": 0.0023655705736018717, "learning_rate": 8.816176470588235e-07, "loss": 2.370853690081276e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 481.125, "completions/min_length": 413.0, "epoch": 1.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016595885157585144, "kl": 0.0035440564970485866, "learning_rate": 8.823529411764705e-07, "loss": 3.550010296748951e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 517.375, "completions/min_length": 465.0, "epoch": 1.7661764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.006458481773734093, "kl": 0.002450467727612704, "learning_rate": 8.830882352941175e-07, "loss": 2.443797347950749e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 420.75, "completions/min_length": 350.0, "epoch": 1.7676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.014738762751221657, "kl": 0.0031885259377304465, "learning_rate": 8.838235294117647e-07, "loss": 3.229444700991735e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 483.0, "completions/min_length": 414.0, "epoch": 1.7691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.71439528465271, "kl": 0.0024293624446727335, "learning_rate": 8.845588235294117e-07, "loss": 2.4378299713134766e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 431.0, "completions/min_length": 384.0, "epoch": 1.7705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0759276151657104, "kl": 0.0029018615023232996, "learning_rate": 8.852941176470587e-07, "loss": 2.825474803103134e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 437.9375, "completions/min_length": 376.0, "epoch": 1.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.03835015743970871, "kl": 0.006739721342455596, "learning_rate": 8.860294117647058e-07, "loss": 6.962155748624355e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 509.1875, "completions/min_length": 436.0, "epoch": 1.7735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.009051757864654064, "kl": 0.002487080608261749, "learning_rate": 8.867647058823528e-07, "loss": 2.4874116206774488e-05, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 469.1875, "completions/min_length": 421.0, "epoch": 1.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.005243734456598759, "kl": 0.002114876842824742, "learning_rate": 8.874999999999999e-07, "loss": 2.119310556736309e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 411.25, "completions/min_length": 328.0, "epoch": 1.776470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0085880756378174, "kl": 0.0032308200607076287, "learning_rate": 8.88235294117647e-07, "loss": 3.207128611393273e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 441.0, "completions/min_length": 367.0, "epoch": 1.7779411764705881, "frac_reward_zero_std": 0.0, "grad_norm": 1.450483798980713, "kl": 0.003090371610596776, "learning_rate": 8.88970588235294e-07, "loss": 3.0077993869781494e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 468.3125, "completions/min_length": 408.0, "epoch": 1.7794117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 1.5171111822128296, "kl": 0.002491330204065889, "learning_rate": 8.89705882352941e-07, "loss": 2.5019049644470215e-05, "reward": 0.8428499698638916, "reward_std": 0.3464616537094116, "rewards/DrugCombAccuracyCOTORM/mean": 0.8207499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.3863793611526489, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.30740854144096375, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 534.75, "completions/min_length": 472.0, "epoch": 1.7808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.7039709091186523, "kl": 0.0033476820099167526, "learning_rate": 8.904411764705882e-07, "loss": 3.3602118492126465e-05, "reward": 0.7024999856948853, "reward_std": 0.29494357109069824, "rewards/DrugCombAccuracyCOTORM/mean": 0.639843761920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4254443645477295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 482.5625, "completions/min_length": 418.0, "epoch": 1.7823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.6634196043014526, "kl": 0.0035436717444099486, "learning_rate": 8.911764705882353e-07, "loss": 3.5643577575683594e-05, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 492.5, "completions/min_length": 416.0, "epoch": 1.7838235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.2899792194366455, "kl": 0.003320447343867272, "learning_rate": 8.919117647058824e-07, "loss": 3.301352262496948e-05, "reward": 0.7250000238418579, "reward_std": 0.24928468465805054, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 418.75, "completions/min_length": 371.0, "epoch": 1.7852941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.008923537097871304, "kl": 0.002785290824249387, "learning_rate": 8.926470588235294e-07, "loss": 2.7863250579684973e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 471.75, "completions/min_length": 360.0, "epoch": 1.7867647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.02680952101945877, "kl": 0.004067441739607602, "learning_rate": 8.933823529411765e-07, "loss": 4.070677459822036e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 472.8125, "completions/min_length": 415.0, "epoch": 1.788235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1178478002548218, "kl": 0.0023584214213769883, "learning_rate": 8.941176470588236e-07, "loss": 2.331751602469012e-05, "reward": 0.843583345413208, "reward_std": 0.06894412636756897, "rewards/DrugCombAccuracyCOTORM/mean": 0.8331249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.19901038706302643, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5013869404792786, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 478.0, "completions/min_length": 424.0, "epoch": 1.789705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053485664539039135, "kl": 0.002318498882232234, "learning_rate": 8.948529411764706e-07, "loss": 2.33856299018953e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 463.6875, "completions/min_length": 405.0, "epoch": 1.7911764705882351, "frac_reward_zero_std": 1.0, "grad_norm": 0.010216987691819668, "kl": 0.0024235183373093605, "learning_rate": 8.955882352941176e-07, "loss": 2.421159115328919e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 469.8125, "completions/min_length": 311.0, "epoch": 1.7926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0392974615097046, "kl": 0.0036025623558089137, "learning_rate": 8.963235294117648e-07, "loss": 3.591179847717285e-05, "reward": 0.6556442975997925, "reward_std": 0.052415791898965836, "rewards/DrugCombAccuracyCOTORM/mean": 0.5867428779602051, "rewards/DrugCombAccuracyCOTORM/std": 0.4333403706550598, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 526.25, "completions/min_length": 431.0, "epoch": 1.7941176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 1.5792597532272339, "kl": 0.0031870705424807966, "learning_rate": 8.970588235294118e-07, "loss": 3.2007694244384766e-05, "reward": 0.21649999916553497, "reward_std": 0.19787031412124634, "rewards/DrugCombAccuracyCOTORM/mean": 0.08312500268220901, "rewards/DrugCombAccuracyCOTORM/std": 0.2508510649204254, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.7745966911315918, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 486.0, "completions/min_length": 428.0, "epoch": 1.7955882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.3250010013580322, "kl": 0.003296039823908359, "learning_rate": 8.977941176470588e-07, "loss": 3.275275230407715e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 410.8125, "completions/min_length": 363.0, "epoch": 1.7970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9507681131362915, "kl": 0.00248358934186399, "learning_rate": 8.985294117647059e-07, "loss": 2.4916338588809595e-05, "reward": 0.8653750419616699, "reward_std": 0.18580015003681183, "rewards/DrugCombAccuracyCOTORM/mean": 0.8434374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.3365992605686188, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 426.375, "completions/min_length": 344.0, "epoch": 1.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03054756298661232, "kl": 0.0037996157770976424, "learning_rate": 8.992647058823529e-07, "loss": 3.8099118683021516e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 482.8125, "completions/min_length": 429.0, "epoch": 1.8, "frac_reward_zero_std": 0.5, "grad_norm": 0.9725350737571716, "kl": 0.0028140234062448144, "learning_rate": 9e-07, "loss": 2.7835571017931215e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 434.375, "completions/min_length": 381.0, "epoch": 1.8014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013807356357574463, "kl": 0.002400466473773122, "learning_rate": 9.007352941176471e-07, "loss": 2.3848040655138902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 424.1875, "completions/min_length": 371.0, "epoch": 1.8029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0044598579406738, "kl": 0.002637981961015612, "learning_rate": 9.014705882352941e-07, "loss": 2.6497989892959595e-05, "reward": 0.7270833253860474, "reward_std": 0.18429133296012878, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4714045524597168, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 472.5, "completions/min_length": 436.0, "epoch": 1.8044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2338688373565674, "kl": 0.004745206388179213, "learning_rate": 9.022058823529411e-07, "loss": 4.6879053115844727e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 458.6875, "completions/min_length": 377.0, "epoch": 1.8058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9667945504188538, "kl": 0.00226747925626114, "learning_rate": 9.029411764705883e-07, "loss": 2.269446849822998e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 410.1875, "completions/min_length": 375.0, "epoch": 1.8073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.000914216041565, "kl": 0.0027017566026188433, "learning_rate": 9.036764705882353e-07, "loss": 2.7239322662353516e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 444.375, "completions/min_length": 408.0, "epoch": 1.8088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0552252531051636, "kl": 0.0034375435789115727, "learning_rate": 9.044117647058823e-07, "loss": 3.428123818594031e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 507.0625, "completions/min_length": 421.0, "epoch": 1.8102941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0821808576583862, "kl": 0.003410515666473657, "learning_rate": 9.051470588235294e-07, "loss": 3.399953857297078e-05, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 485.25, "completions/min_length": 375.0, "epoch": 1.811764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5671448707580566, "kl": 0.0034193163155578077, "learning_rate": 9.058823529411764e-07, "loss": 3.4339725971221924e-05, "reward": 0.3375000059604645, "reward_std": 0.24237501621246338, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 558.1875, "completions/min_length": 444.0, "epoch": 1.813235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2099913358688354, "kl": 0.003046369267394766, "learning_rate": 9.066176470588235e-07, "loss": 3.0428171157836914e-05, "reward": 0.7678382396697998, "reward_std": 0.2261258363723755, "rewards/DrugCombAccuracyCOTORM/mean": 0.7358394861221313, "rewards/DrugCombAccuracyCOTORM/std": 0.26782673597335815, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.4881560504436493, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 489.625, "completions/min_length": 403.0, "epoch": 1.8147058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0474976301193237, "kl": 0.0035450313007459044, "learning_rate": 9.073529411764706e-07, "loss": 3.4205615520477295e-05, "reward": 0.6452499628067017, "reward_std": 0.06266926974058151, "rewards/DrugCombAccuracyCOTORM/mean": 0.574791669845581, "rewards/DrugCombAccuracyCOTORM/std": 0.45180949568748474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17612075805664062, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 438.3125, "completions/min_length": 371.0, "epoch": 1.8161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0632476806640625, "kl": 0.003240767226088792, "learning_rate": 9.080882352941176e-07, "loss": 3.232806921005249e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 426.5, "completions/min_length": 352.0, "epoch": 1.8176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.970720112323761, "kl": 0.0028971885913051665, "learning_rate": 9.088235294117646e-07, "loss": 2.886615038732998e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 485.625, "completions/min_length": 439.0, "epoch": 1.8191176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3308748006820679, "kl": 0.0030477058608084917, "learning_rate": 9.095588235294118e-07, "loss": 3.0182301998138428e-05, "reward": 0.33385416865348816, "reward_std": 0.23440171778202057, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 478.625, "completions/min_length": 431.0, "epoch": 1.8205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9809772968292236, "kl": 0.0037884394405409694, "learning_rate": 9.102941176470588e-07, "loss": 3.7878373404964805e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 450.0625, "completions/min_length": 400.0, "epoch": 1.8220588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014214704744517803, "kl": 0.003725620044860989, "learning_rate": 9.110294117647058e-07, "loss": 3.7274588976288214e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 446.875, "completions/min_length": 407.0, "epoch": 1.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02207564190030098, "kl": 0.005143693764694035, "learning_rate": 9.117647058823529e-07, "loss": 5.077888636151329e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 431.6875, "completions/min_length": 385.0, "epoch": 1.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.012511692009866238, "kl": 0.0032708581420592964, "learning_rate": 9.124999999999999e-07, "loss": 3.297632065368816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 442.625, "completions/min_length": 399.0, "epoch": 1.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011199746280908585, "kl": 0.0029059311491437256, "learning_rate": 9.13235294117647e-07, "loss": 2.9288563382579014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 406.9375, "completions/min_length": 383.0, "epoch": 1.8279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009027227759361267, "kl": 0.00329053244786337, "learning_rate": 9.139705882352941e-07, "loss": 3.3032309147529304e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 485.8125, "completions/min_length": 428.0, "epoch": 1.8294117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.008301806636154652, "kl": 0.002441459335386753, "learning_rate": 9.147058823529411e-07, "loss": 2.4425344236078672e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/mean_length": 539.0, "completions/min_length": 358.0, "epoch": 1.8308823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.7897012829780579, "kl": 0.0021962608152534813, "learning_rate": 9.154411764705881e-07, "loss": 2.2131771402200684e-05, "reward": 0.6868369579315186, "reward_std": 0.1920088827610016, "rewards/DrugCombAccuracyCOTORM/mean": 0.6565372943878174, "rewards/DrugCombAccuracyCOTORM/std": 0.44127893447875977, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6160714626312256, "rewards/DrugCombCoverageCOTORM/std": 0.8025787472724915, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 437.5, "completions/min_length": 378.0, "epoch": 1.8323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.014463381841778755, "kl": 0.002900131803471595, "learning_rate": 9.161764705882353e-07, "loss": 2.9237286071293056e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 437.75, "completions/min_length": 401.0, "epoch": 1.8338235294117649, "frac_reward_zero_std": 0.5, "grad_norm": 1.0550150871276855, "kl": 0.0030776027706451714, "learning_rate": 9.169117647058823e-07, "loss": 3.113140701316297e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 453.0625, "completions/min_length": 421.0, "epoch": 1.835294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.007206116337329149, "kl": 0.002555929298978299, "learning_rate": 9.176470588235293e-07, "loss": 2.547793701523915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 442.75, "completions/min_length": 358.0, "epoch": 1.836764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019296573475003242, "kl": 0.0033699495252221823, "learning_rate": 9.183823529411764e-07, "loss": 3.363611176609993e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 459.5625, "completions/min_length": 416.0, "epoch": 1.8382352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.006791543681174517, "kl": 0.0028222216642461717, "learning_rate": 9.191176470588234e-07, "loss": 2.815543848555535e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 476.6875, "completions/min_length": 396.0, "epoch": 1.8397058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.01266578957438469, "kl": 0.0033143532928079367, "learning_rate": 9.198529411764705e-07, "loss": 3.322681368445046e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.4375, "completions/min_length": 357.0, "epoch": 1.8411764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0548691749572754, "kl": 0.003479706239886582, "learning_rate": 9.205882352941176e-07, "loss": 3.5468488931655884e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 509.875, "completions/min_length": 387.0, "epoch": 1.8426470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.3623344898223877, "kl": 0.00345931010087952, "learning_rate": 9.213235294117646e-07, "loss": 3.471970558166504e-05, "reward": 0.8284749984741211, "reward_std": 0.36304759979248047, "rewards/DrugCombAccuracyCOTORM/mean": 0.8207499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.3863793611526489, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 502.1875, "completions/min_length": 418.0, "epoch": 1.8441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0944877862930298, "kl": 0.0029237547423690557, "learning_rate": 9.220588235294117e-07, "loss": 2.92360782623291e-05, "reward": 0.8224583268165588, "reward_std": 0.20092269778251648, "rewards/DrugCombAccuracyCOTORM/mean": 0.7950000166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3817503750324249, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.2803354561328888, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 464.0, "completions/min_length": 417.0, "epoch": 1.8455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.050468094646930695, "kl": 0.004042249609483406, "learning_rate": 9.227941176470589e-07, "loss": 4.071333751198836e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 443.1875, "completions/min_length": 361.0, "epoch": 1.8470588235294119, "frac_reward_zero_std": 1.0, "grad_norm": 0.023213811218738556, "kl": 0.0029536697547882795, "learning_rate": 9.235294117647059e-07, "loss": 2.966707688756287e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 480.75, "completions/min_length": 421.0, "epoch": 1.848529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8790351748466492, "kl": 0.0030106239137239754, "learning_rate": 9.242647058823529e-07, "loss": 2.9942812034278177e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 471.4375, "completions/min_length": 404.0, "epoch": 1.85, "frac_reward_zero_std": 0.5, "grad_norm": 0.879844069480896, "kl": 0.0028221646207384765, "learning_rate": 9.25e-07, "loss": 2.8189271688461304e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 458.3125, "completions/min_length": 429.0, "epoch": 1.8514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.007582556921988726, "kl": 0.0027196017326787114, "learning_rate": 9.257352941176471e-07, "loss": 2.716107337619178e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 501.75, "completions/min_length": 430.0, "epoch": 1.8529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.05969620868563652, "kl": 0.003699678141856566, "learning_rate": 9.264705882352941e-07, "loss": 3.787626337725669e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 524.6875, "completions/min_length": 399.0, "epoch": 1.8544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011697042733430862, "kl": 0.0034581427462399006, "learning_rate": 9.272058823529412e-07, "loss": 3.4680870157899335e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 420.0, "completions/min_length": 349.0, "epoch": 1.8558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.2697902917861938, "kl": 0.0025693417992442846, "learning_rate": 9.279411764705882e-07, "loss": 2.562999725341797e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 457.875, "completions/min_length": 357.0, "epoch": 1.8573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8957722187042236, "kl": 0.0032788938842713833, "learning_rate": 9.286764705882352e-07, "loss": 3.310292959213257e-05, "reward": 0.643750011920929, "reward_std": 0.1237436980009079, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 474.8125, "completions/min_length": 410.0, "epoch": 1.8588235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.5642776489257812, "kl": 0.0031509873224422336, "learning_rate": 9.294117647058824e-07, "loss": 3.184827801305801e-05, "reward": 0.6850833296775818, "reward_std": 0.1642644703388214, "rewards/DrugCombAccuracyCOTORM/mean": 0.6454166769981384, "rewards/DrugCombAccuracyCOTORM/std": 0.4415610134601593, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5335936546325684, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 479.375, "completions/min_length": 396.0, "epoch": 1.8602941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.2094361782073975, "kl": 0.0029750067624263465, "learning_rate": 9.301470588235294e-07, "loss": 2.9576922315754928e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 437.4375, "completions/min_length": 367.0, "epoch": 1.861764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2424817085266113, "kl": 0.0037717934465035796, "learning_rate": 9.308823529411764e-07, "loss": 3.758072853088379e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 410.1875, "completions/min_length": 350.0, "epoch": 1.863235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00699268002063036, "kl": 0.0027633371064439416, "learning_rate": 9.316176470588235e-07, "loss": 2.7287216653348878e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 453.9375, "completions/min_length": 408.0, "epoch": 1.8647058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.007990648038685322, "kl": 0.002664146333700046, "learning_rate": 9.323529411764706e-07, "loss": 2.6639314455678686e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 411.75, "completions/min_length": 332.0, "epoch": 1.8661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007554133888334036, "kl": 0.002630740695167333, "learning_rate": 9.330882352941176e-07, "loss": 2.6622614313964732e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 444.25, "completions/min_length": 378.0, "epoch": 1.8676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9381530284881592, "kl": 0.0029190699860919267, "learning_rate": 9.338235294117647e-07, "loss": 2.921372652053833e-05, "reward": 0.7571250200271606, "reward_std": 0.20230136811733246, "rewards/DrugCombAccuracyCOTORM/mean": 0.7081249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4495474696159363, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 398.1875, "completions/min_length": 365.0, "epoch": 1.8691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.148412823677063, "kl": 0.0026732602855190635, "learning_rate": 9.345588235294117e-07, "loss": 2.6728957891464233e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 427.75, "completions/min_length": 372.0, "epoch": 1.8705882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.007371445186436176, "kl": 0.0026109706377610564, "learning_rate": 9.352941176470588e-07, "loss": 2.601289452286437e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 462.625, "completions/min_length": 355.0, "epoch": 1.8720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00884606596082449, "kl": 0.0026975636137649417, "learning_rate": 9.360294117647059e-07, "loss": 2.682503327378072e-05, "reward": 0.9033333659172058, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8999999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.10327955335378647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 461.5, "completions/min_length": 379.0, "epoch": 1.8735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01010981760919094, "kl": 0.0026850957074202597, "learning_rate": 9.367647058823529e-07, "loss": 2.6775836886372417e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 483.25, "completions/min_length": 373.0, "epoch": 1.875, "frac_reward_zero_std": 0.0, "grad_norm": 1.5601094961166382, "kl": 0.005001068988349289, "learning_rate": 9.374999999999999e-07, "loss": 5.010142922401428e-05, "reward": 0.34627974033355713, "reward_std": 0.3641503155231476, "rewards/DrugCombAccuracyCOTORM/mean": 0.2857142984867096, "rewards/DrugCombAccuracyCOTORM/std": 0.4487321674823761, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1770833432674408, "rewards/DrugCombCoverageCOTORM/std": 0.961227536201477, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 492.4375, "completions/min_length": 409.0, "epoch": 1.8764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8200711011886597, "kl": 0.0032277156715281308, "learning_rate": 9.38235294117647e-07, "loss": 3.194768214598298e-05, "reward": 0.5484374761581421, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 475.625, "completions/min_length": 422.0, "epoch": 1.8779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0858244895935059, "kl": 0.0041462614317424595, "learning_rate": 9.389705882352941e-07, "loss": 4.065409302711487e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 556.4375, "completions/min_length": 462.0, "epoch": 1.8794117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.422589659690857, "kl": 0.0029159595142118633, "learning_rate": 9.397058823529411e-07, "loss": 2.8956681489944458e-05, "reward": 0.605672299861908, "reward_std": 0.24233898520469666, "rewards/DrugCombAccuracyCOTORM/mean": 0.5090434551239014, "rewards/DrugCombAccuracyCOTORM/std": 0.4562222957611084, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 449.6875, "completions/min_length": 402.0, "epoch": 1.8808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008344888687133789, "kl": 0.0027490968932397664, "learning_rate": 9.404411764705882e-07, "loss": 2.7928730560233817e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 432.5625, "completions/min_length": 372.0, "epoch": 1.8823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1356576681137085, "kl": 0.0032440699287690222, "learning_rate": 9.411764705882352e-07, "loss": 3.2343312341254205e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 433.6875, "completions/min_length": 390.0, "epoch": 1.8838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.08629920333623886, "kl": 0.0041561732068657875, "learning_rate": 9.419117647058823e-07, "loss": 4.0398575947619975e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 447.3125, "completions/min_length": 357.0, "epoch": 1.8852941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.7974799871444702, "kl": 0.0035917078494094312, "learning_rate": 9.426470588235294e-07, "loss": 3.62396240234375e-05, "reward": 0.5750000476837158, "reward_std": 0.44636839628219604, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 441.1875, "completions/min_length": 367.0, "epoch": 1.886764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.9187625050544739, "kl": 0.015320193429943174, "learning_rate": 9.433823529411764e-07, "loss": 0.0001476647303206846, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 438.5625, "completions/min_length": 393.0, "epoch": 1.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017812712118029594, "kl": 0.0033364229020662606, "learning_rate": 9.441176470588234e-07, "loss": 3.359228139743209e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 424.0, "completions/min_length": 351.0, "epoch": 1.8897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02206709422171116, "kl": 0.004782153409905732, "learning_rate": 9.448529411764705e-07, "loss": 4.722271842183545e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 507.0, "completions/min_length": 426.0, "epoch": 1.8911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.2659963369369507, "kl": 0.002952971786726266, "learning_rate": 9.455882352941176e-07, "loss": 2.9497183277271688e-05, "reward": 0.746874988079071, "reward_std": 0.20967130362987518, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 485.875, "completions/min_length": 412.0, "epoch": 1.8926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8758161664009094, "kl": 0.002949591842480004, "learning_rate": 9.463235294117646e-07, "loss": 2.9612259822897613e-05, "reward": 0.6868333220481873, "reward_std": 0.13967616856098175, "rewards/DrugCombAccuracyCOTORM/mean": 0.6449999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4190465211868286, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5288001894950867, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 545.625, "completions/min_length": 454.0, "epoch": 1.8941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.369434118270874, "kl": 0.0023029205040074885, "learning_rate": 9.470588235294117e-07, "loss": 2.300180494785309e-05, "reward": 0.5216888189315796, "reward_std": 0.3177140951156616, "rewards/DrugCombAccuracyCOTORM/mean": 0.43205899000167847, "rewards/DrugCombAccuracyCOTORM/std": 0.3785109221935272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2112463116645813, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 421.125, "completions/min_length": 363.0, "epoch": 1.8955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.006997051648795605, "kl": 0.0024236428434960544, "learning_rate": 9.477941176470587e-07, "loss": 2.428431616863236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 445.6875, "completions/min_length": 397.0, "epoch": 1.8970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.007002672646194696, "kl": 0.0035028946003876626, "learning_rate": 9.485294117647058e-07, "loss": 3.528457091306336e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 497.125, "completions/min_length": 430.0, "epoch": 1.8985294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1898291110992432, "kl": 0.003682423906866461, "learning_rate": 9.492647058823529e-07, "loss": 3.6776793422177434e-05, "reward": 0.9302083253860474, "reward_std": 0.09717614203691483, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18257419764995575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 465.0, "completions/min_length": 413.0, "epoch": 1.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.9270645976066589, "kl": 0.004071150557138026, "learning_rate": 9.499999999999999e-07, "loss": 4.056096076965332e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 475.4375, "completions/min_length": 434.0, "epoch": 1.901470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.021852092817425728, "kl": 0.004025271744467318, "learning_rate": 9.507352941176469e-07, "loss": 4.05301725550089e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 458.75, "completions/min_length": 370.0, "epoch": 1.9029411764705881, "frac_reward_zero_std": 0.0, "grad_norm": 1.4318240880966187, "kl": 0.004563893482554704, "learning_rate": 9.51470588235294e-07, "loss": 4.502385854721069e-05, "reward": 0.550000011920929, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 511.3125, "completions/min_length": 432.0, "epoch": 1.9044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.918719470500946, "kl": 0.004578844702336937, "learning_rate": 9.522058823529411e-07, "loss": 4.672926297644153e-05, "reward": 0.9227625131607056, "reward_std": 0.14303673803806305, "rewards/DrugCombAccuracyCOTORM/mean": 0.9069687128067017, "rewards/DrugCombAccuracyCOTORM/std": 0.2542276382446289, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.971875011920929, "rewards/DrugCombCoverageCOTORM/std": 0.07739239931106567, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 436.375, "completions/min_length": 398.0, "epoch": 1.9058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2852014303207397, "kl": 0.0030702861258760095, "learning_rate": 9.529411764705881e-07, "loss": 3.067749639740214e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 420.5, "completions/min_length": 332.0, "epoch": 1.9073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.005584171507507563, "kl": 0.002734527050051838, "learning_rate": 9.536764705882352e-07, "loss": 2.7375686840969138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 448.375, "completions/min_length": 412.0, "epoch": 1.9088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.008981208316981792, "kl": 0.002289376745466143, "learning_rate": 9.544117647058823e-07, "loss": 2.2871678083902225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 420.6875, "completions/min_length": 374.0, "epoch": 1.9102941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.013733172789216042, "kl": 0.003952510131057352, "learning_rate": 9.551470588235295e-07, "loss": 3.956106229452416e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 446.6875, "completions/min_length": 393.0, "epoch": 1.9117647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.010129190981388092, "kl": 0.002557477040681988, "learning_rate": 9.558823529411764e-07, "loss": 2.5321360226371326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 458.4375, "completions/min_length": 390.0, "epoch": 1.913235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0222954750061035, "kl": 0.0036815846688114107, "learning_rate": 9.566176470588235e-07, "loss": 3.69454464816954e-05, "reward": 0.8979166746139526, "reward_std": 0.1890740841627121, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 444.0625, "completions/min_length": 380.0, "epoch": 1.914705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0843168497085571, "kl": 0.01292444480350241, "learning_rate": 9.573529411764706e-07, "loss": 0.0001232437789440155, "reward": 0.6768749952316284, "reward_std": 0.08401448279619217, "rewards/DrugCombAccuracyCOTORM/mean": 0.6214843988418579, "rewards/DrugCombAccuracyCOTORM/std": 0.40821394324302673, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 399.3125, "completions/min_length": 357.0, "epoch": 1.9161764705882351, "frac_reward_zero_std": 1.0, "grad_norm": 0.010766243562102318, "kl": 0.003553128452040255, "learning_rate": 9.580882352941176e-07, "loss": 3.51166381733492e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 477.0, "completions/min_length": 348.0, "epoch": 1.9176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.564103364944458, "kl": 0.0038646473549306393, "learning_rate": 9.588235294117647e-07, "loss": 3.871321678161621e-05, "reward": 0.6718750596046448, "reward_std": 0.3056429624557495, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 404.5625, "completions/min_length": 359.0, "epoch": 1.9191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.00864215474575758, "kl": 0.0030225044465623796, "learning_rate": 9.595588235294118e-07, "loss": 3.0079076168476604e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 447.5625, "completions/min_length": 406.0, "epoch": 1.9205882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.009484972804784775, "kl": 0.003244224935770035, "learning_rate": 9.602941176470587e-07, "loss": 3.260480661992915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 423.875, "completions/min_length": 369.0, "epoch": 1.9220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.033164430409669876, "kl": 0.00373479921836406, "learning_rate": 9.610294117647058e-07, "loss": 3.689548611873761e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 473.875, "completions/min_length": 417.0, "epoch": 1.923529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03657027333974838, "kl": 0.0034127512481063604, "learning_rate": 9.61764705882353e-07, "loss": 3.394788654986769e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 445.0625, "completions/min_length": 356.0, "epoch": 1.925, "frac_reward_zero_std": 0.5, "grad_norm": 1.1786060333251953, "kl": 0.006607943796552718, "learning_rate": 9.624999999999999e-07, "loss": 6.66305422782898e-05, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 452.9375, "completions/min_length": 388.0, "epoch": 1.9264705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.8195005655288696, "kl": 0.0039158439612947404, "learning_rate": 9.63235294117647e-07, "loss": 3.897026181221008e-05, "reward": 0.550000011920929, "reward_std": 0.46579423546791077, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 506.9375, "completions/min_length": 438.0, "epoch": 1.9279411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.011172621510922909, "kl": 0.00391664745984599, "learning_rate": 9.639705882352941e-07, "loss": 3.9337286580121145e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 491.3125, "completions/min_length": 405.0, "epoch": 1.9294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.3320612907409668, "kl": 0.002996859315317124, "learning_rate": 9.64705882352941e-07, "loss": 3.0349940061569214e-05, "reward": 0.745187520980835, "reward_std": 0.3847334086894989, "rewards/DrugCombAccuracyCOTORM/mean": 0.7029687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4585931599140167, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.49765074253082275, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 446.875, "completions/min_length": 387.0, "epoch": 1.9308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1946732997894287, "kl": 0.004186137695796788, "learning_rate": 9.654411764705882e-07, "loss": 4.1507184505462646e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 427.6875, "completions/min_length": 392.0, "epoch": 1.9323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012453998439013958, "kl": 0.0032621981808915734, "learning_rate": 9.661764705882353e-07, "loss": 3.261474194005132e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 461.75, "completions/min_length": 409.0, "epoch": 1.9338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1464349031448364, "kl": 0.0029782880446873605, "learning_rate": 9.669117647058822e-07, "loss": 2.936091914307326e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/mean_length": 521.6875, "completions/min_length": 377.0, "epoch": 1.9352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9159186482429504, "kl": 0.0026814075536094606, "learning_rate": 9.676470588235294e-07, "loss": 2.6881694793701172e-05, "reward": 0.7540090084075928, "reward_std": 0.14876188337802887, "rewards/DrugCombAccuracyCOTORM/mean": 0.7054018974304199, "rewards/DrugCombAccuracyCOTORM/std": 0.4003717303276062, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8968750238418579, "rewards/DrugCombCoverageCOTORM/std": 0.21766597032546997, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 406.0625, "completions/min_length": 335.0, "epoch": 1.936764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0763317346572876, "kl": 0.0025626467540860176, "learning_rate": 9.683823529411765e-07, "loss": 2.5406479835510254e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 469.8125, "completions/min_length": 399.0, "epoch": 1.938235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.005770089570432901, "kl": 0.002655502612469718, "learning_rate": 9.691176470588234e-07, "loss": 2.6465562768862583e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 445.6875, "completions/min_length": 366.0, "epoch": 1.9397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013633455149829388, "kl": 0.0028672306798398495, "learning_rate": 9.698529411764705e-07, "loss": 2.865911301341839e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 490.0, "completions/min_length": 389.0, "epoch": 1.9411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3672118186950684, "kl": 0.004304926958866417, "learning_rate": 9.705882352941176e-07, "loss": 4.330277442932129e-05, "reward": 0.5062500238418579, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 421.625, "completions/min_length": 394.0, "epoch": 1.9426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012698155827820301, "kl": 0.0030290907598100603, "learning_rate": 9.713235294117646e-07, "loss": 3.017435665242374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 449.9375, "completions/min_length": 406.0, "epoch": 1.9441176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.5286837816238403, "kl": 0.004032375174574554, "learning_rate": 9.720588235294117e-07, "loss": 3.985315561294556e-05, "reward": 0.6812500357627869, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 524.0625, "completions/min_length": 458.0, "epoch": 1.9455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.933505117893219, "kl": 0.0029421254293993115, "learning_rate": 9.727941176470588e-07, "loss": 2.915108052548021e-05, "reward": 0.6524219512939453, "reward_std": 0.015066765248775482, "rewards/DrugCombAccuracyCOTORM/mean": 0.5889649391174316, "rewards/DrugCombAccuracyCOTORM/std": 0.425158828496933, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.20069323480129242, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 443.75, "completions/min_length": 401.0, "epoch": 1.9470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008788461796939373, "kl": 0.003221253340598196, "learning_rate": 9.735294117647057e-07, "loss": 3.2084048143588006e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 464.0, "completions/min_length": 372.0, "epoch": 1.9485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.012495298869907856, "kl": 0.004028320137877017, "learning_rate": 9.742647058823529e-07, "loss": 4.0277940570376813e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 507.3125, "completions/min_length": 416.0, "epoch": 1.95, "frac_reward_zero_std": 0.5, "grad_norm": 1.0880522727966309, "kl": 0.002632551360875368, "learning_rate": 9.75e-07, "loss": 2.6542693376541138e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 441.375, "completions/min_length": 394.0, "epoch": 1.951470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6610524654388428, "kl": 0.004292204917874187, "learning_rate": 9.75735294117647e-07, "loss": 4.2825937271118164e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 432.25, "completions/min_length": 377.0, "epoch": 1.9529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01204905565828085, "kl": 0.003077200148254633, "learning_rate": 9.76470588235294e-07, "loss": 3.073164407396689e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 452.1875, "completions/min_length": 401.0, "epoch": 1.9544117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.009557874873280525, "kl": 0.003628185309935361, "learning_rate": 9.772058823529412e-07, "loss": 3.605611709645018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 478.0625, "completions/min_length": 422.0, "epoch": 1.9558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.0437506437301636, "kl": 0.0033533762325532734, "learning_rate": 9.77941176470588e-07, "loss": 3.36534358211793e-05, "reward": 0.7718750238418579, "reward_std": 0.20152875781059265, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 527.4375, "completions/min_length": 420.0, "epoch": 1.9573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9070714712142944, "kl": 0.0027202399796806276, "learning_rate": 9.786764705882352e-07, "loss": 2.6807188987731934e-05, "reward": 0.6505694389343262, "reward_std": 0.11601655930280685, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.47842535376548767, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8819444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.13739942014217377, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 459.4375, "completions/min_length": 397.0, "epoch": 1.9588235294117649, "frac_reward_zero_std": 0.5, "grad_norm": 0.924108624458313, "kl": 0.0035047035198658705, "learning_rate": 9.794117647058823e-07, "loss": 3.49581241607666e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 411.125, "completions/min_length": 368.0, "epoch": 1.960294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1140849590301514, "kl": 0.0029031875310465693, "learning_rate": 9.801470588235292e-07, "loss": 2.9042363166809082e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 471.75, "completions/min_length": 427.0, "epoch": 1.961764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009747564792633057, "kl": 0.0034836247214116156, "learning_rate": 9.808823529411764e-07, "loss": 3.4870143281295896e-05, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 437.6875, "completions/min_length": 406.0, "epoch": 1.9632352941176472, "frac_reward_zero_std": 0.0, "grad_norm": 1.373192548751831, "kl": 0.003259648336097598, "learning_rate": 9.816176470588235e-07, "loss": 3.262609243392944e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 437.3125, "completions/min_length": 409.0, "epoch": 1.9647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.009611973538994789, "kl": 0.0037744190194644034, "learning_rate": 9.823529411764704e-07, "loss": 3.771860065171495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 469.75, "completions/min_length": 368.0, "epoch": 1.9661764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9728860855102539, "kl": 0.002841352252289653, "learning_rate": 9.830882352941175e-07, "loss": 2.8471975383581594e-05, "reward": 0.7145833373069763, "reward_std": 0.16557681560516357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42163705825805664, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 427.125, "completions/min_length": 380.0, "epoch": 1.9676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.007960121147334576, "kl": 0.003074174339417368, "learning_rate": 9.838235294117647e-07, "loss": 3.080980241065845e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 487.0625, "completions/min_length": 388.0, "epoch": 1.9691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8565178513526917, "kl": 0.0029963243869133294, "learning_rate": 9.845588235294118e-07, "loss": 2.9996037483215332e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 431.0625, "completions/min_length": 374.0, "epoch": 1.9705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.007786692585796118, "kl": 0.0030221231281757355, "learning_rate": 9.85294117647059e-07, "loss": 3.007660234288778e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 489.5, "completions/min_length": 450.0, "epoch": 1.9720588235294119, "frac_reward_zero_std": 1.0, "grad_norm": 0.006522926967591047, "kl": 0.002737275790423155, "learning_rate": 9.860294117647058e-07, "loss": 2.723697980400175e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 463.5625, "completions/min_length": 424.0, "epoch": 1.973529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0694248676300049, "kl": 0.0026964182616211474, "learning_rate": 9.86764705882353e-07, "loss": 2.6844441890716553e-05, "reward": 0.6088333129882812, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4849589467048645, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 411.0625, "completions/min_length": 374.0, "epoch": 1.975, "frac_reward_zero_std": 0.5, "grad_norm": 1.0084348917007446, "kl": 0.0023518106900155544, "learning_rate": 9.875e-07, "loss": 2.3702188627794385e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 443.625, "completions/min_length": 360.0, "epoch": 1.9764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.017856264486908913, "kl": 0.003236747463233769, "learning_rate": 9.88235294117647e-07, "loss": 3.243025275878608e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 454.375, "completions/min_length": 414.0, "epoch": 1.9779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.006009034812450409, "kl": 0.001971102028619498, "learning_rate": 9.889705882352941e-07, "loss": 1.9702451027114876e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 446.25, "completions/min_length": 384.0, "epoch": 1.9794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.4012643098831177, "kl": 0.0042673127027228475, "learning_rate": 9.897058823529412e-07, "loss": 4.268437623977661e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 460.5625, "completions/min_length": 376.0, "epoch": 1.9808823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.679978370666504, "kl": 0.0038805549265816808, "learning_rate": 9.904411764705882e-07, "loss": 3.822147846221924e-05, "reward": 0.3854166567325592, "reward_std": 0.2609178125858307, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1458333134651184, "rewards/DrugCombCoverageCOTORM/std": 1.003466248512268, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 471.625, "completions/min_length": 395.0, "epoch": 1.9823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7411580085754395, "kl": 0.0032754713611211628, "learning_rate": 9.911764705882353e-07, "loss": 3.262609243392944e-05, "reward": 0.5089166760444641, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 516.1875, "completions/min_length": 427.0, "epoch": 1.9838235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.2001969814300537, "kl": 0.004137791518587619, "learning_rate": 9.919117647058824e-07, "loss": 4.116259515285492e-05, "reward": 0.7950875163078308, "reward_std": 0.2062573879957199, "rewards/DrugCombAccuracyCOTORM/mean": 0.7770625352859497, "rewards/DrugCombAccuracyCOTORM/std": 0.41038769483566284, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.547722578048706, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 454.25, "completions/min_length": 394.0, "epoch": 1.9852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.3344532251358032, "kl": 0.0034300502738915384, "learning_rate": 9.926470588235293e-07, "loss": 3.406950781936757e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 459.5625, "completions/min_length": 390.0, "epoch": 1.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.037982702255249, "kl": 0.0035402399953454733, "learning_rate": 9.933823529411765e-07, "loss": 3.5293400287628174e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 403.625, "completions/min_length": 335.0, "epoch": 1.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9796234965324402, "kl": 0.0028969881241209805, "learning_rate": 9.941176470588236e-07, "loss": 2.8968041078769602e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 423.1875, "completions/min_length": 311.0, "epoch": 1.9897058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.7954102754592896, "kl": 0.004010035889223218, "learning_rate": 9.948529411764705e-07, "loss": 3.9968639612197876e-05, "reward": 0.543749988079071, "reward_std": 0.150047168135643, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 490.4375, "completions/min_length": 434.0, "epoch": 1.9911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8878196477890015, "kl": 0.003289471671450883, "learning_rate": 9.955882352941176e-07, "loss": 3.274030314059928e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.625, "completions/min_length": 387.0, "epoch": 1.9926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2147724628448486, "kl": 0.0035403158399276435, "learning_rate": 9.963235294117647e-07, "loss": 3.562122583389282e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 508.9375, "completions/min_length": 393.0, "epoch": 1.9941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4307141304016113, "kl": 0.004037493083160371, "learning_rate": 9.970588235294117e-07, "loss": 4.0858983993530273e-05, "reward": 0.6271250247955322, "reward_std": 0.2447950690984726, "rewards/DrugCombAccuracyCOTORM/mean": 0.5404166579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.4545113444328308, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 445.0, "completions/min_length": 406.0, "epoch": 1.9955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01922827959060669, "kl": 0.0038925125845707953, "learning_rate": 9.977941176470588e-07, "loss": 3.9223003113875166e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 462.5, "completions/min_length": 395.0, "epoch": 1.9970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.020838797092437744, "kl": 0.004220697737764567, "learning_rate": 9.98529411764706e-07, "loss": 4.2100149585166946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 411.4375, "completions/min_length": 330.0, "epoch": 1.9985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1528836488723755, "kl": 0.003112180740572512, "learning_rate": 9.992647058823528e-07, "loss": 3.100186586380005e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 432.3125, "completions/min_length": 367.0, "epoch": 2.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.0836206674575806, "kl": 0.002795211272314191, "learning_rate": 1e-06, "loss": 2.7809999664896168e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 471.8125, "completions/min_length": 429.0, "epoch": 2.001470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4520152807235718, "kl": 0.0034074688446708024, "learning_rate": 9.999999835306327e-07, "loss": 3.403425216674805e-05, "reward": 0.48750001192092896, "reward_std": 0.425373911857605, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 457.8125, "completions/min_length": 423.0, "epoch": 2.0029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02517990954220295, "kl": 0.00459965446498245, "learning_rate": 9.999999341225317e-07, "loss": 4.5423483243212104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 474.5625, "completions/min_length": 383.0, "epoch": 2.0044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1921554803848267, "kl": 0.003559120057616383, "learning_rate": 9.999998517757004e-07, "loss": 3.583729267120361e-05, "reward": 0.543749988079071, "reward_std": 0.04172614961862564, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 494.0625, "completions/min_length": 435.0, "epoch": 2.0058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7838974595069885, "kl": 0.0027383274282328784, "learning_rate": 9.999997364901446e-07, "loss": 2.7365516871213913e-05, "reward": 0.9140416383743286, "reward_std": 0.15937022864818573, "rewards/DrugCombAccuracyCOTORM/mean": 0.8990625143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.27599647641181946, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 496.0625, "completions/min_length": 463.0, "epoch": 2.0073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.017412543296814, "kl": 0.004349719267338514, "learning_rate": 9.999995882658709e-07, "loss": 4.3369829654693604e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 534.9375, "completions/min_length": 449.0, "epoch": 2.0088235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 2.2635347843170166, "kl": 0.003661150694824755, "learning_rate": 9.999994071028902e-07, "loss": 3.6578625440597534e-05, "reward": 0.6979583501815796, "reward_std": 0.3504698574542999, "rewards/DrugCombAccuracyCOTORM/mean": 0.6549999713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.40666666626930237, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.375, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 419.6875, "completions/min_length": 374.0, "epoch": 2.010294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009557818993926048, "kl": 0.003065536089707166, "learning_rate": 9.999991930012135e-07, "loss": 3.081745308008976e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 436.25, "completions/min_length": 375.0, "epoch": 2.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7392274141311646, "kl": 0.00261986514669843, "learning_rate": 9.999989459608554e-07, "loss": 2.6055073249153793e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 515.625, "completions/min_length": 411.0, "epoch": 2.013235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1444400548934937, "kl": 0.0033704827073961496, "learning_rate": 9.999986659818322e-07, "loss": 3.388058394193649e-05, "reward": 0.7305882573127747, "reward_std": 0.14533217251300812, "rewards/DrugCombAccuracyCOTORM/mean": 0.6905790567398071, "rewards/DrugCombAccuracyCOTORM/std": 0.38679805397987366, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.49279531836509705, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 465.125, "completions/min_length": 400.0, "epoch": 2.014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8810932040214539, "kl": 0.0029359153122641146, "learning_rate": 9.999983530641621e-07, "loss": 2.9476288545993157e-05, "reward": 0.7250000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 413.6875, "completions/min_length": 343.0, "epoch": 2.0161764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.0448203086853027, "kl": 0.0033206097432412207, "learning_rate": 9.99998007207866e-07, "loss": 3.371719503775239e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 466.875, "completions/min_length": 416.0, "epoch": 2.0176470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.593306064605713, "kl": 0.0045120301656425, "learning_rate": 9.999976284129663e-07, "loss": 4.500299473875202e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 434.375, "completions/min_length": 402.0, "epoch": 2.0191176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.080179214477539, "kl": 0.00452833907911554, "learning_rate": 9.999972166794884e-07, "loss": 4.5147819037083536e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 432.5625, "completions/min_length": 335.0, "epoch": 2.0205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.209532618522644, "kl": 0.0033595318091101944, "learning_rate": 9.99996772007459e-07, "loss": 3.343154821777716e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 475.4375, "completions/min_length": 428.0, "epoch": 2.0220588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0619299411773682, "kl": 0.0041910654399544, "learning_rate": 9.999962943969078e-07, "loss": 4.199113755021244e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 461.1875, "completions/min_length": 339.0, "epoch": 2.023529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0343183279037476, "kl": 0.0034600390936248004, "learning_rate": 9.999957838478658e-07, "loss": 3.463029861450195e-05, "reward": 0.8657500147819519, "reward_std": 0.18726228177547455, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471023142337799, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 442.1875, "completions/min_length": 389.0, "epoch": 2.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.007170557510107756, "kl": 0.0025732393842190504, "learning_rate": 9.999952403603673e-07, "loss": 2.565193790360354e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 407.1875, "completions/min_length": 329.0, "epoch": 2.026470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008615248836576939, "kl": 0.003173477540258318, "learning_rate": 9.999946639344474e-07, "loss": 3.1728905014460906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 460.25, "completions/min_length": 372.0, "epoch": 2.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009888083674013615, "kl": 0.00349319790257141, "learning_rate": 9.999940545701444e-07, "loss": 3.431918958085589e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 436.4375, "completions/min_length": 374.0, "epoch": 2.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.025718705728650093, "kl": 0.003909947641659528, "learning_rate": 9.999934122674985e-07, "loss": 3.927609941456467e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 433.9375, "completions/min_length": 383.0, "epoch": 2.0308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.621826171875, "kl": 0.003764769120607525, "learning_rate": 9.999927370265517e-07, "loss": 3.752865086426027e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 468.25, "completions/min_length": 364.0, "epoch": 2.0323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.1619584560394287, "kl": 0.009156915009953082, "learning_rate": 9.99992028847349e-07, "loss": 8.386133413296193e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 471.5, "completions/min_length": 415.0, "epoch": 2.0338235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.4265779256820679, "kl": 0.003130842582322657, "learning_rate": 9.999912877299366e-07, "loss": 3.106892108917236e-05, "reward": 0.37558335065841675, "reward_std": 0.27045390009880066, "rewards/DrugCombAccuracyCOTORM/mean": 0.26374998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.44237053394317627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5896483659744263, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 462.625, "completions/min_length": 397.0, "epoch": 2.0352941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 1.0684211254119873, "kl": 0.004293719946872443, "learning_rate": 9.999905136743633e-07, "loss": 4.328787326812744e-05, "reward": 0.574999988079071, "reward_std": 0.17320507764816284, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.9309493899345398, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 445.125, "completions/min_length": 394.0, "epoch": 2.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01167229749262333, "kl": 0.0033246648963540792, "learning_rate": 9.999897066806806e-07, "loss": 3.3357042411807925e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 451.75, "completions/min_length": 412.0, "epoch": 2.038235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01557033509016037, "kl": 0.0033751161536201835, "learning_rate": 9.999888667489413e-07, "loss": 3.38395475409925e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 468.1875, "completions/min_length": 371.0, "epoch": 2.039705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.272436499595642, "kl": 0.004461202071979642, "learning_rate": 9.999879938792006e-07, "loss": 4.478543996810913e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 478.5, "completions/min_length": 435.0, "epoch": 2.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.219902515411377, "kl": 0.00374719372484833, "learning_rate": 9.999870880715162e-07, "loss": 3.7610530853271484e-05, "reward": 0.7020833492279053, "reward_std": 0.17244449257850647, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.43461349606513977, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 509.25, "completions/min_length": 452.0, "epoch": 2.0426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.009467680007219315, "kl": 0.003487607929855585, "learning_rate": 9.999861493259478e-07, "loss": 3.490910603431985e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 476.1875, "completions/min_length": 412.0, "epoch": 2.0441176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9699041843414307, "kl": 0.004315496131312102, "learning_rate": 9.999851776425574e-07, "loss": 4.278836422599852e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 434.0, "completions/min_length": 382.0, "epoch": 2.0455882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075159380212426186, "kl": 0.002859062806237489, "learning_rate": 9.999841730214084e-07, "loss": 2.8373435270623304e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 468.1875, "completions/min_length": 421.0, "epoch": 2.0470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 1.3264636993408203, "kl": 0.003177370992489159, "learning_rate": 9.999831354625675e-07, "loss": 3.1888484954833984e-05, "reward": 0.18125000596046448, "reward_std": 0.3343539237976074, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.75, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 429.6875, "completions/min_length": 354.0, "epoch": 2.0485294117647057, "frac_reward_zero_std": 1.0, "grad_norm": 0.008846497163176537, "kl": 0.002818849461618811, "learning_rate": 9.999820649661032e-07, "loss": 2.7730142392101698e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 523.0, "completions/min_length": 403.0, "epoch": 2.05, "frac_reward_zero_std": 0.0, "grad_norm": 1.5212398767471313, "kl": 0.010877642314881086, "learning_rate": 9.999809615320856e-07, "loss": 0.00010943412780761719, "reward": 0.5916041731834412, "reward_std": 0.2682040333747864, "rewards/DrugCombAccuracyCOTORM/mean": 0.49731773138046265, "rewards/DrugCombAccuracyCOTORM/std": 0.43286219239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 432.375, "completions/min_length": 368.0, "epoch": 2.051470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02790961228311062, "kl": 0.003981541260145605, "learning_rate": 9.999798251605876e-07, "loss": 3.991055200458504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 476.625, "completions/min_length": 430.0, "epoch": 2.052941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.371084451675415, "kl": 0.006273309933021665, "learning_rate": 9.99978655851684e-07, "loss": 6.170073174871504e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 464.9375, "completions/min_length": 426.0, "epoch": 2.054411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1054900884628296, "kl": 0.005687382537871599, "learning_rate": 9.999774536054517e-07, "loss": 5.742162466049194e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 452.1875, "completions/min_length": 380.0, "epoch": 2.0558823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.08115062117576599, "kl": 0.00454046786762774, "learning_rate": 9.9997621842197e-07, "loss": 4.5686181692872196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 447.75, "completions/min_length": 383.0, "epoch": 2.0573529411764704, "frac_reward_zero_std": 0.5, "grad_norm": 1.2211415767669678, "kl": 0.0030983270844444633, "learning_rate": 9.999749503013205e-07, "loss": 3.08305025100708e-05, "reward": 0.13508333265781403, "reward_std": 0.07931926846504211, "rewards/DrugCombAccuracyCOTORM/mean": 0.04124999791383743, "rewards/DrugCombAccuracyCOTORM/std": 0.08868484199047089, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0208333358168602, "rewards/DrugCombCoverageCOTORM/std": 0.589648425579071, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 435.75, "completions/min_length": 372.0, "epoch": 2.0588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.016334019601345062, "kl": 0.003452216798905283, "learning_rate": 9.999736492435865e-07, "loss": 3.4443110052961856e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 465.3125, "completions/min_length": 414.0, "epoch": 2.0602941176470586, "frac_reward_zero_std": 0.5, "grad_norm": 1.2106256484985352, "kl": 0.0038993213674984872, "learning_rate": 9.999723152488536e-07, "loss": 3.871559601975605e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 510.5, "completions/min_length": 449.0, "epoch": 2.0617647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 1.0285255908966064, "kl": 0.004181364376563579, "learning_rate": 9.999709483172103e-07, "loss": 4.161713150097057e-05, "reward": 0.7620657086372375, "reward_std": 0.15892332792282104, "rewards/DrugCombAccuracyCOTORM/mean": 0.7221133708953857, "rewards/DrugCombAccuracyCOTORM/std": 0.3823261857032776, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23741470277309418, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 464.75, "completions/min_length": 437.0, "epoch": 2.0632352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.012479566968977451, "kl": 0.003262235375586897, "learning_rate": 9.999695484487458e-07, "loss": 3.274583650636487e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 476.25, "completions/min_length": 396.0, "epoch": 2.0647058823529414, "frac_reward_zero_std": 0.0, "grad_norm": 1.369476079940796, "kl": 0.0034525758237577975, "learning_rate": 9.999681156435528e-07, "loss": 3.4786760807037354e-05, "reward": 0.737500011920929, "reward_std": 0.3709394931793213, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 405.125, "completions/min_length": 316.0, "epoch": 2.0661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01101153064519167, "kl": 0.003956491069402546, "learning_rate": 9.999666499017256e-07, "loss": 3.936118082492612e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 500.8125, "completions/min_length": 449.0, "epoch": 2.0676470588235296, "frac_reward_zero_std": 0.5, "grad_norm": 1.3949105739593506, "kl": 0.004413402930367738, "learning_rate": 9.999651512233607e-07, "loss": 4.408508539199829e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 411.375, "completions/min_length": 312.0, "epoch": 2.0691176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.2920434474945068, "kl": 0.005716257233871147, "learning_rate": 9.99963619608557e-07, "loss": 5.3572024626191705e-05, "reward": 0.4375, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 525.125, "completions/min_length": 445.0, "epoch": 2.070588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9260354042053223, "kl": 0.003399033157620579, "learning_rate": 9.999620550574153e-07, "loss": 3.44105064868927e-05, "reward": 0.925000011920929, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 447.3125, "completions/min_length": 371.0, "epoch": 2.072058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.264499306678772, "kl": 0.0049876937991939485, "learning_rate": 9.999604575700385e-07, "loss": 5.014985799789429e-05, "reward": 0.8553333282470703, "reward_std": 0.20654159784317017, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 451.5, "completions/min_length": 380.0, "epoch": 2.073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011368215084075928, "kl": 0.004766447760630399, "learning_rate": 9.999588271465322e-07, "loss": 4.731646185973659e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 493.875, "completions/min_length": 440.0, "epoch": 2.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.8713778257369995, "kl": 0.003733098797965795, "learning_rate": 9.999571637870034e-07, "loss": 3.771483898162842e-05, "reward": 0.21250000596046448, "reward_std": 0.1642080694437027, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 462.1875, "completions/min_length": 365.0, "epoch": 2.0764705882352943, "frac_reward_zero_std": 0.0, "grad_norm": 1.4539213180541992, "kl": 0.003278022399172187, "learning_rate": 9.999554674915621e-07, "loss": 3.246217966079712e-05, "reward": 0.59375, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 504.0625, "completions/min_length": 452.0, "epoch": 2.0779411764705884, "frac_reward_zero_std": 0.0, "grad_norm": 1.5437909364700317, "kl": 0.003547518572304398, "learning_rate": 9.999537382603198e-07, "loss": 3.5468488931655884e-05, "reward": 0.7699375152587891, "reward_std": 0.3705611824989319, "rewards/DrugCombAccuracyCOTORM/mean": 0.7339062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.4135919511318207, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.33811673521995544, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 452.75, "completions/min_length": 407.0, "epoch": 2.0794117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 1.1452465057373047, "kl": 0.003358329413458705, "learning_rate": 9.999519760933903e-07, "loss": 3.3542513847351074e-05, "reward": 0.7687023878097534, "reward_std": 0.1989324539899826, "rewards/DrugCombAccuracyCOTORM/mean": 0.736919641494751, "rewards/DrugCombAccuracyCOTORM/std": 0.4231080412864685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4013864994049072, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 448.8125, "completions/min_length": 403.0, "epoch": 2.0808823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.006790085230022669, "kl": 0.002756188448984176, "learning_rate": 9.9995018099089e-07, "loss": 2.7244514058111235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 427.3125, "completions/min_length": 384.0, "epoch": 2.0823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.011215949431061745, "kl": 0.0036999585572630167, "learning_rate": 9.999483529529368e-07, "loss": 3.6980138247599825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 448.375, "completions/min_length": 380.0, "epoch": 2.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.041386425495147705, "kl": 0.005213220603764057, "learning_rate": 9.999464919796513e-07, "loss": 5.243597843218595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 463.9375, "completions/min_length": 373.0, "epoch": 2.085294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.4936405420303345, "kl": 0.005035055743064731, "learning_rate": 9.999445980711562e-07, "loss": 4.972517490386963e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 512.0625, "completions/min_length": 433.0, "epoch": 2.086764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9633073806762695, "kl": 0.003428085590712726, "learning_rate": 9.999426712275764e-07, "loss": 3.40092447004281e-05, "reward": 0.9324333667755127, "reward_std": 0.1386796087026596, "rewards/DrugCombAccuracyCOTORM/mean": 0.924916684627533, "rewards/DrugCombAccuracyCOTORM/std": 0.22720548510551453, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20816659927368164, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 398.3125, "completions/min_length": 309.0, "epoch": 2.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3161495923995972, "kl": 0.004653317620977759, "learning_rate": 9.999407114490383e-07, "loss": 4.645266380975954e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 455.75, "completions/min_length": 390.0, "epoch": 2.0897058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.0460083484649658, "kl": 0.00366757734445855, "learning_rate": 9.999387187356716e-07, "loss": 3.640539580374025e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 446.4375, "completions/min_length": 395.0, "epoch": 2.0911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8612244129180908, "kl": 0.0033909952035173774, "learning_rate": 9.99936693087607e-07, "loss": 3.388632831047289e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 510.125, "completions/min_length": 438.0, "epoch": 2.0926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0477770566940308, "kl": 0.003950479265768081, "learning_rate": 9.999346345049786e-07, "loss": 3.960728645324707e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 411.1875, "completions/min_length": 330.0, "epoch": 2.0941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1770501136779785, "kl": 0.003540400299243629, "learning_rate": 9.999325429879215e-07, "loss": 3.540005855029449e-05, "reward": 0.585812509059906, "reward_std": 0.17963135242462158, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.234375, "rewards/DrugCombCoverageCOTORM/std": 0.9893969297409058, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 467.0625, "completions/min_length": 407.0, "epoch": 2.0955882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.9947624802589417, "kl": 0.005110167257953435, "learning_rate": 9.999304185365735e-07, "loss": 5.039173265686259e-05, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 517.625, "completions/min_length": 458.0, "epoch": 2.097058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9631709456443787, "kl": 0.003412134712561965, "learning_rate": 9.999282611510748e-07, "loss": 3.4258198866155e-05, "reward": 0.9151785969734192, "reward_std": 0.1502397209405899, "rewards/DrugCombAccuracyCOTORM/mean": 0.9017857313156128, "rewards/DrugCombAccuracyCOTORM/std": 0.24863573908805847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 457.3125, "completions/min_length": 421.0, "epoch": 2.098529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012403354980051517, "kl": 0.0034967908286489546, "learning_rate": 9.999260708315673e-07, "loss": 3.5256511182524264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 490.5, "completions/min_length": 375.0, "epoch": 2.1, "frac_reward_zero_std": 0.5, "grad_norm": 1.0085002183914185, "kl": 0.003603865741752088, "learning_rate": 9.999238475781957e-07, "loss": 3.591179847717285e-05, "reward": 0.23125000298023224, "reward_std": 0.15492895245552063, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419354915619, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 475.0625, "completions/min_length": 436.0, "epoch": 2.101470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009316819719970226, "kl": 0.003711850557010621, "learning_rate": 9.999215913911058e-07, "loss": 3.6924247979186475e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 421.75, "completions/min_length": 358.0, "epoch": 2.1029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1460325717926025, "kl": 0.004588572424836457, "learning_rate": 9.99919302270447e-07, "loss": 4.5595243136631325e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 475.3125, "completions/min_length": 420.0, "epoch": 2.1044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7422212958335876, "kl": 0.004313677200116217, "learning_rate": 9.999169802163694e-07, "loss": 4.3272972106933594e-05, "reward": 0.831250011920929, "reward_std": 0.10415475070476532, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 486.25, "completions/min_length": 431.0, "epoch": 2.1058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0210151672363281, "kl": 0.0034088532556779683, "learning_rate": 9.999146252290262e-07, "loss": 3.4137414331780747e-05, "reward": 0.8964166641235352, "reward_std": 0.19718943536281586, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 551.3125, "completions/min_length": 486.0, "epoch": 2.1073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9961823225021362, "kl": 0.004148934385739267, "learning_rate": 9.999122373085728e-07, "loss": 4.155933856964111e-05, "reward": 0.6234542727470398, "reward_std": 0.052191220223903656, "rewards/DrugCombAccuracyCOTORM/mean": 0.5449428558349609, "rewards/DrugCombAccuracyCOTORM/std": 0.47757914662361145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.16124515235424042, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 412.375, "completions/min_length": 363.0, "epoch": 2.1088235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.9261340498924255, "kl": 0.003594701993279159, "learning_rate": 9.999098164551663e-07, "loss": 3.5703182220458984e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 431.6875, "completions/min_length": 396.0, "epoch": 2.110294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011676307767629623, "kl": 0.004393597017042339, "learning_rate": 9.999073626689663e-07, "loss": 4.4081851228838786e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 435.3125, "completions/min_length": 333.0, "epoch": 2.111764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0471290349960327, "kl": 0.004384531581308693, "learning_rate": 9.999048759501344e-07, "loss": 4.366676876088604e-05, "reward": 0.9619500041007996, "reward_std": 0.10762164741754532, "rewards/DrugCombAccuracyCOTORM/mean": 0.9539999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.18400000035762787, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 470.75, "completions/min_length": 405.0, "epoch": 2.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01325263362377882, "kl": 0.0034668464795686305, "learning_rate": 9.99902356298834e-07, "loss": 3.473301330814138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 410.9375, "completions/min_length": 374.0, "epoch": 2.114705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.025907672941684723, "kl": 0.004145028884522617, "learning_rate": 9.998998037152318e-07, "loss": 4.1578674427000806e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 453.8125, "completions/min_length": 399.0, "epoch": 2.1161764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5666981935501099, "kl": 0.0037382448790594935, "learning_rate": 9.998972181994955e-07, "loss": 3.6969780921936035e-05, "reward": 0.7875000238418579, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 424.5625, "completions/min_length": 375.0, "epoch": 2.1176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0633796453475952, "kl": 0.002629081951454282, "learning_rate": 9.998945997517955e-07, "loss": 2.6300549507141113e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 416.0625, "completions/min_length": 348.0, "epoch": 2.1191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.021709447726607323, "kl": 0.004013834579382092, "learning_rate": 9.998919483723046e-07, "loss": 4.019068001070991e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 438.875, "completions/min_length": 362.0, "epoch": 2.1205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1742016077041626, "kl": 0.004330729483626783, "learning_rate": 9.998892640611968e-07, "loss": 4.2926520109176636e-05, "reward": 0.6299999952316284, "reward_std": 0.05796550586819649, "rewards/DrugCombAccuracyCOTORM/mean": 0.5687500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.45213383436203003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 478.3125, "completions/min_length": 429.0, "epoch": 2.1220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9637528657913208, "kl": 0.003989757504314184, "learning_rate": 9.998865468186498e-07, "loss": 3.9637088775634766e-05, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 484.8125, "completions/min_length": 429.0, "epoch": 2.123529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.024427056312561, "kl": 0.004285388276912272, "learning_rate": 9.99883796644842e-07, "loss": 4.266202449798584e-05, "reward": 0.4375, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 424.3125, "completions/min_length": 381.0, "epoch": 2.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.008064784109592438, "kl": 0.002952383190859109, "learning_rate": 9.998810135399545e-07, "loss": 2.955576201202348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 484.625, "completions/min_length": 397.0, "epoch": 2.126470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9089515805244446, "kl": 0.0031810831860639155, "learning_rate": 9.998781975041711e-07, "loss": 3.1694769859313965e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 407.0, "completions/min_length": 313.0, "epoch": 2.1279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009403145872056484, "kl": 0.0034254472120665014, "learning_rate": 9.99875348537677e-07, "loss": 3.3975335099967197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 459.625, "completions/min_length": 369.0, "epoch": 2.1294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8466707468032837, "kl": 0.003203646861948073, "learning_rate": 9.9987246664066e-07, "loss": 3.188025584677234e-05, "reward": 0.6817708015441895, "reward_std": 0.15936285257339478, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 466.8125, "completions/min_length": 424.0, "epoch": 2.1308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0104044675827026, "kl": 0.0037791093927808106, "learning_rate": 9.998695518133097e-07, "loss": 3.771290721488185e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 451.3125, "completions/min_length": 391.0, "epoch": 2.1323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010286845266819, "kl": 0.0032265745103359222, "learning_rate": 9.998666040558186e-07, "loss": 3.213250602129847e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 443.625, "completions/min_length": 396.0, "epoch": 2.1338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9059488773345947, "kl": 0.003352369589265436, "learning_rate": 9.998636233683806e-07, "loss": 3.3568219805601984e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 427.9375, "completions/min_length": 362.0, "epoch": 2.135294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.038170337677002, "kl": 0.00416739092906937, "learning_rate": 9.99860609751192e-07, "loss": 4.127621650695801e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 451.125, "completions/min_length": 387.0, "epoch": 2.136764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012546072714030743, "kl": 0.0036132733803242445, "learning_rate": 9.998575632044514e-07, "loss": 3.625576209742576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 484.875, "completions/min_length": 394.0, "epoch": 2.138235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0163484811782837, "kl": 0.003004994534421712, "learning_rate": 9.998544837283595e-07, "loss": 3.0179715395206586e-05, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 470.8125, "completions/min_length": 428.0, "epoch": 2.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.019790049642324448, "kl": 0.004180950054433197, "learning_rate": 9.998513713231194e-07, "loss": 4.1978262743214145e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 427.1875, "completions/min_length": 322.0, "epoch": 2.1411764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.006347910035401583, "kl": 0.0028817448765039444, "learning_rate": 9.998482259889358e-07, "loss": 2.901078732975293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 444.625, "completions/min_length": 362.0, "epoch": 2.1426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7686663866043091, "kl": 0.0027835718356072903, "learning_rate": 9.99845047726016e-07, "loss": 2.7873615181306377e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 430.0625, "completions/min_length": 373.0, "epoch": 2.1441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0432907342910767, "kl": 0.005466762697324157, "learning_rate": 9.998418365345693e-07, "loss": 5.3610539907822385e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 447.3125, "completions/min_length": 394.0, "epoch": 2.1455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.007707768585532904, "kl": 0.0032665624748915434, "learning_rate": 9.998385924148077e-07, "loss": 3.2612566428724676e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 472.1875, "completions/min_length": 399.0, "epoch": 2.1470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009409922175109386, "kl": 0.0032196242827922106, "learning_rate": 9.998353153669442e-07, "loss": 3.239704528823495e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 464.1875, "completions/min_length": 404.0, "epoch": 2.148529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.08651986718177795, "kl": 0.0071463375934399664, "learning_rate": 9.998320053911953e-07, "loss": 7.17020157026127e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 445.6875, "completions/min_length": 358.0, "epoch": 2.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.007209867238998413, "kl": 0.0030250324634835124, "learning_rate": 9.998286624877785e-07, "loss": 3.0376231734408066e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 484.3125, "completions/min_length": 433.0, "epoch": 2.151470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1985876560211182, "kl": 0.0036718989722430706, "learning_rate": 9.998252866569145e-07, "loss": 3.6739715142175555e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 444.75, "completions/min_length": 379.0, "epoch": 2.152941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1090465784072876, "kl": 0.004991052148398012, "learning_rate": 9.998218778988256e-07, "loss": 4.9697107897372916e-05, "reward": 0.737500011920929, "reward_std": 0.2183542549610138, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 455.0625, "completions/min_length": 373.0, "epoch": 2.1544117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.2056124210357666, "kl": 0.004222299612592906, "learning_rate": 9.998184362137364e-07, "loss": 4.185397847322747e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 467.8125, "completions/min_length": 387.0, "epoch": 2.1558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9388964176177979, "kl": 0.0042325115646235645, "learning_rate": 9.998149616018731e-07, "loss": 4.2414903873577714e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 413.3125, "completions/min_length": 358.0, "epoch": 2.1573529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 700.9381103515625, "kl": 3.998389249725733, "learning_rate": 9.998114540634652e-07, "loss": 0.043872542679309845, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 460.625, "completions/min_length": 404.0, "epoch": 2.1588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1975374221801758, "kl": 0.003947924007661641, "learning_rate": 9.998079135987436e-07, "loss": 3.919037771993317e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 439.6875, "completions/min_length": 393.0, "epoch": 2.1602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.012535938061773777, "kl": 0.003606850979849696, "learning_rate": 9.998043402079415e-07, "loss": 3.5746776120504364e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 521.5625, "completions/min_length": 447.0, "epoch": 2.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.973619282245636, "kl": 0.003099790366832167, "learning_rate": 9.998007338912944e-07, "loss": 3.109872341156006e-05, "reward": 0.21011459827423096, "reward_std": 0.12689705193042755, "rewards/DrugCombAccuracyCOTORM/mean": 0.1509895920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.2621614634990692, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1067708432674408, "rewards/DrugCombCoverageCOTORM/std": 0.9415767788887024, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 446.8125, "completions/min_length": 360.0, "epoch": 2.163235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.027196615934371948, "kl": 0.0035226373584009707, "learning_rate": 9.997970946490396e-07, "loss": 3.549038228811696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 431.25, "completions/min_length": 382.0, "epoch": 2.164705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0361759662628174, "kl": 0.004160558572039008, "learning_rate": 9.997934224814171e-07, "loss": 4.122818063478917e-05, "reward": 0.6479166746139526, "reward_std": 0.1423826515674591, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 443.3125, "completions/min_length": 384.0, "epoch": 2.166176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.4252476692199707, "kl": 0.0028769944328814745, "learning_rate": 9.99789717388669e-07, "loss": 2.882629632949829e-05, "reward": 0.9312499761581421, "reward_std": 0.18224450945854187, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.6875, "completions/min_length": 388.0, "epoch": 2.1676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9525002241134644, "kl": 0.005204824905376881, "learning_rate": 9.997859793710388e-07, "loss": 5.1838687795680016e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 412.375, "completions/min_length": 377.0, "epoch": 2.1691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.016999613493680954, "kl": 0.0034066973021253943, "learning_rate": 9.997822084287731e-07, "loss": 3.42133644153364e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 466.8125, "completions/min_length": 350.0, "epoch": 2.1705882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.011439497582614422, "kl": 0.0033733199234120548, "learning_rate": 9.997784045621204e-07, "loss": 3.335817746119574e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 446.8125, "completions/min_length": 386.0, "epoch": 2.1720588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.010450201109051704, "kl": 0.0034539118641987443, "learning_rate": 9.997745677713312e-07, "loss": 3.448021016083658e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 461.8125, "completions/min_length": 398.0, "epoch": 2.1735294117647057, "frac_reward_zero_std": 0.5, "grad_norm": 0.859397828578949, "kl": 0.003162350185448304, "learning_rate": 9.997706980566582e-07, "loss": 3.1232397304847836e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 418.4375, "completions/min_length": 347.0, "epoch": 2.175, "frac_reward_zero_std": 0.5, "grad_norm": 1.134317398071289, "kl": 0.0038624918670393527, "learning_rate": 9.997667954183564e-07, "loss": 3.843051672447473e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 503.8125, "completions/min_length": 434.0, "epoch": 2.176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6253074407577515, "kl": 0.004032818542327732, "learning_rate": 9.99762859856683e-07, "loss": 4.015117883682251e-05, "reward": 0.8500000238418579, "reward_std": 0.24832583963871002, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 501.4375, "completions/min_length": 404.0, "epoch": 2.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7812206149101257, "kl": 0.0025867239746730775, "learning_rate": 9.997588913718968e-07, "loss": 2.5602079404052347e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 506.6875, "completions/min_length": 453.0, "epoch": 2.179411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5007712841033936, "kl": 0.004254082741681486, "learning_rate": 9.997548899642599e-07, "loss": 4.3082982301712036e-05, "reward": 0.7246875166893005, "reward_std": 0.20778970420360565, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.746874988079071, "rewards/DrugCombCoverageCOTORM/std": 0.48835399746894836, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 426.5, "completions/min_length": 358.0, "epoch": 2.1808823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.13730265200138092, "kl": 0.00594977056607604, "learning_rate": 9.997508556340354e-07, "loss": 5.9508362028282136e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 433.5, "completions/min_length": 369.0, "epoch": 2.1823529411764704, "frac_reward_zero_std": 0.5, "grad_norm": 1.033837914466858, "kl": 0.003476861456874758, "learning_rate": 9.997467883814893e-07, "loss": 3.4833043173421174e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 408.875, "completions/min_length": 338.0, "epoch": 2.1838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.2103191614151, "kl": 0.004487605299800634, "learning_rate": 9.997426882068895e-07, "loss": 4.490972787607461e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 458.125, "completions/min_length": 405.0, "epoch": 2.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1339772939682007, "kl": 0.0037423453177325428, "learning_rate": 9.99738555110506e-07, "loss": 3.746151924133301e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 510.3125, "completions/min_length": 437.0, "epoch": 2.1867647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.308735966682434, "kl": 0.0036636072327382863, "learning_rate": 9.997343890926113e-07, "loss": 3.6425888538360596e-05, "reward": 0.634215772151947, "reward_std": 0.2872942090034485, "rewards/DrugCombAccuracyCOTORM/mean": 0.5714155435562134, "rewards/DrugCombAccuracyCOTORM/std": 0.44691944122314453, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.28463754057884216, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.125, "completions/min_length": 369.0, "epoch": 2.1882352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.022681837901473045, "kl": 0.00456111622042954, "learning_rate": 9.997301901534796e-07, "loss": 4.557479769573547e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 464.0625, "completions/min_length": 356.0, "epoch": 2.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009947994723916054, "kl": 0.004021958680823445, "learning_rate": 9.997259582933877e-07, "loss": 4.011590499430895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 414.25, "completions/min_length": 366.0, "epoch": 2.1911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.5162667036056519, "kl": 0.0037851730594411492, "learning_rate": 9.997216935126143e-07, "loss": 3.7670135498046875e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 529.6875, "completions/min_length": 387.0, "epoch": 2.1926470588235296, "frac_reward_zero_std": 0.5, "grad_norm": 0.8444780707359314, "kl": 0.003306328842882067, "learning_rate": 9.997173958114402e-07, "loss": 3.3132731914520264e-05, "reward": 0.7061644792556763, "reward_std": 0.12034695595502853, "rewards/DrugCombAccuracyCOTORM/mean": 0.6436430215835571, "rewards/DrugCombAccuracyCOTORM/std": 0.4137989282608032, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.17464250326156616, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 448.625, "completions/min_length": 404.0, "epoch": 2.1941176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 0.8155151605606079, "kl": 0.0034843437606468797, "learning_rate": 9.99713065190149e-07, "loss": 3.483891487121582e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 538.25, "completions/min_length": 461.0, "epoch": 2.195588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3899372816085815, "kl": 0.0034853620454669, "learning_rate": 9.997087016490255e-07, "loss": 3.485381603240967e-05, "reward": 0.6000000238418579, "reward_std": 0.2821190655231476, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.46435439586639404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.547722578048706, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 431.25, "completions/min_length": 340.0, "epoch": 2.197058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0255194902420044, "kl": 0.0029648016788996756, "learning_rate": 9.997043051883575e-07, "loss": 2.9340386390686035e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 423.6875, "completions/min_length": 362.0, "epoch": 2.198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011050098575651646, "kl": 0.0043478579027578235, "learning_rate": 9.996998758084343e-07, "loss": 4.361037281341851e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 439.875, "completions/min_length": 370.0, "epoch": 2.2, "frac_reward_zero_std": 0.5, "grad_norm": 1.5244542360305786, "kl": 0.004464895406272262, "learning_rate": 9.996954135095478e-07, "loss": 4.4465065002441406e-05, "reward": 0.8494499921798706, "reward_std": 0.20898984372615814, "rewards/DrugCombAccuracyCOTORM/mean": 0.8289999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.3686906695365906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.30740854144096375, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 424.375, "completions/min_length": 348.0, "epoch": 2.2014705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 1.0338213443756104, "kl": 0.0037771434290334582, "learning_rate": 9.996909182919922e-07, "loss": 3.7941132177365944e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 434.25, "completions/min_length": 398.0, "epoch": 2.2029411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.007165456656366587, "kl": 0.0030665372032672167, "learning_rate": 9.996863901560634e-07, "loss": 3.061678580706939e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 404.9375, "completions/min_length": 325.0, "epoch": 2.2044117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 0.7405036091804504, "kl": 0.0029808120452798903, "learning_rate": 9.996818291020596e-07, "loss": 2.9759252356598154e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 439.5, "completions/min_length": 404.0, "epoch": 2.2058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.029416263103485107, "kl": 0.004223365045618266, "learning_rate": 9.996772351302818e-07, "loss": 4.2366205889265984e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 423.375, "completions/min_length": 368.0, "epoch": 2.2073529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.018651451915502548, "kl": 0.00317030877340585, "learning_rate": 9.99672608241032e-07, "loss": 3.1939969630911946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 445.4375, "completions/min_length": 380.0, "epoch": 2.208823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.026615858078003, "kl": 0.004168522544205189, "learning_rate": 9.996679484346153e-07, "loss": 4.132091999053955e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 488.75, "completions/min_length": 436.0, "epoch": 2.210294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9721981883049011, "kl": 0.0037504666252061725, "learning_rate": 9.996632557113388e-07, "loss": 3.789365291595459e-05, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 475.0625, "completions/min_length": 419.0, "epoch": 2.211764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0116833448410034, "kl": 0.003247816814109683, "learning_rate": 9.996585300715115e-07, "loss": 3.265589475631714e-05, "reward": 0.6875, "reward_std": 0.19594094157218933, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 392.0625, "completions/min_length": 328.0, "epoch": 2.213235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.509894609451294, "kl": 0.003667709475848824, "learning_rate": 9.996537715154446e-07, "loss": 3.70517373085022e-05, "reward": 0.4624999761581421, "reward_std": 0.1767766773700714, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 476.75, "completions/min_length": 399.0, "epoch": 2.2147058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 1.483101725578308, "kl": 0.003995512903202325, "learning_rate": 9.996489800434517e-07, "loss": 3.9674341678619385e-05, "reward": 0.6395833492279053, "reward_std": 0.3773033916950226, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 506.125, "completions/min_length": 413.0, "epoch": 2.2161764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8496478796005249, "kl": 0.003021789132617414, "learning_rate": 9.996441556558484e-07, "loss": 3.0316412448883057e-05, "reward": 0.5062500238418579, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 447.25, "completions/min_length": 368.0, "epoch": 2.2176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.009554977528750896, "kl": 0.003624156815931201, "learning_rate": 9.996392983529527e-07, "loss": 3.5871376894647256e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 454.75, "completions/min_length": 400.0, "epoch": 2.2191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.007183290086686611, "kl": 0.003022205492015928, "learning_rate": 9.996344081350844e-07, "loss": 3.0259863706305623e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 480.9375, "completions/min_length": 391.0, "epoch": 2.2205882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 1.6468702554702759, "kl": 0.003135594248306006, "learning_rate": 9.996294850025657e-07, "loss": 3.0875205993652344e-05, "reward": 0.6353332996368408, "reward_std": 0.10182337462902069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5649999976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4067103862762451, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 438.4375, "completions/min_length": 373.0, "epoch": 2.222058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.00550060486420989, "kl": 0.0029802085482515395, "learning_rate": 9.99624528955721e-07, "loss": 2.964212944789324e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 497.3125, "completions/min_length": 409.0, "epoch": 2.223529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0154392719268799, "kl": 0.00487296940991655, "learning_rate": 9.996195399948766e-07, "loss": 4.8761736252345145e-05, "reward": 0.7777291536331177, "reward_std": 0.21036861836910248, "rewards/DrugCombAccuracyCOTORM/mean": 0.7566666603088379, "rewards/DrugCombAccuracyCOTORM/std": 0.39837446808815, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.54081130027771, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/mean_length": 505.875, "completions/min_length": 379.0, "epoch": 2.225, "frac_reward_zero_std": 0.5, "grad_norm": 1.1096879243850708, "kl": 0.004044193308800459, "learning_rate": 9.996145181203615e-07, "loss": 3.953278064727783e-05, "reward": 0.7726095914840698, "reward_std": 0.11363177001476288, "rewards/DrugCombAccuracyCOTORM/mean": 0.7216213345527649, "rewards/DrugCombAccuracyCOTORM/std": 0.3463623523712158, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 421.9375, "completions/min_length": 383.0, "epoch": 2.226470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9931000471115112, "kl": 0.0039056792738847435, "learning_rate": 9.99609463332506e-07, "loss": 3.912004467565566e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 490.4375, "completions/min_length": 434.0, "epoch": 2.2279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1520650386810303, "kl": 0.0032588354079052806, "learning_rate": 9.996043756316437e-07, "loss": 3.2804906368255615e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 448.375, "completions/min_length": 375.0, "epoch": 2.2294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.007045714184641838, "kl": 0.0035876132897101343, "learning_rate": 9.995992550181094e-07, "loss": 3.5427434340817854e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 437.875, "completions/min_length": 364.0, "epoch": 2.2308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.018904326483607292, "kl": 0.003673371858894825, "learning_rate": 9.995941014922405e-07, "loss": 3.671353624667972e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 532.9375, "completions/min_length": 414.0, "epoch": 2.2323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.2505708932876587, "kl": 0.002831471851095557, "learning_rate": 9.995889150543764e-07, "loss": 2.8386712074279785e-05, "reward": 0.43494999408721924, "reward_std": 0.35704803466796875, "rewards/DrugCombAccuracyCOTORM/mean": 0.3499374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.47606030106544495, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.550000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.5033223032951355, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 442.6875, "completions/min_length": 398.0, "epoch": 2.2338235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.01518233586102724, "kl": 0.003091333666816354, "learning_rate": 9.995836957048589e-07, "loss": 3.105807627434842e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 449.25, "completions/min_length": 413.0, "epoch": 2.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8362950086593628, "kl": 0.0034860464511439204, "learning_rate": 9.995784434440318e-07, "loss": 3.4905970096588135e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 445.3125, "completions/min_length": 363.0, "epoch": 2.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012581637129187584, "kl": 0.004066163732204586, "learning_rate": 9.995731582722412e-07, "loss": 4.062817606609315e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 450.3125, "completions/min_length": 371.0, "epoch": 2.238235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008124154061079025, "kl": 0.0033642257913015783, "learning_rate": 9.995678401898353e-07, "loss": 3.3923086448339745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 548.625, "completions/min_length": 441.0, "epoch": 2.239705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3709471225738525, "kl": 0.0035259552532806993, "learning_rate": 9.995624891971642e-07, "loss": 3.505870699882507e-05, "reward": 0.7166666984558105, "reward_std": 0.28281551599502563, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4121982753276825, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 521.6875, "completions/min_length": 407.0, "epoch": 2.2411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6649889945983887, "kl": 0.004023750370834023, "learning_rate": 9.995571052945806e-07, "loss": 4.048272967338562e-05, "reward": 0.5, "reward_std": 0.3562890887260437, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 472.875, "completions/min_length": 377.0, "epoch": 2.2426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015257606282830238, "kl": 0.004180057963822037, "learning_rate": 9.99551688482439e-07, "loss": 4.175679350737482e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 451.375, "completions/min_length": 348.0, "epoch": 2.2441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.3849645853042603, "kl": 0.004840171313844621, "learning_rate": 9.995462387610965e-07, "loss": 4.929651186103001e-05, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 444.9375, "completions/min_length": 396.0, "epoch": 2.2455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.258590579032898, "kl": 0.004054026445373893, "learning_rate": 9.995407561309119e-07, "loss": 4.066923065693118e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/mean_length": 523.5625, "completions/min_length": 389.0, "epoch": 2.2470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9527106285095215, "kl": 0.004245704796630889, "learning_rate": 9.995352405922466e-07, "loss": 4.295913822716102e-05, "reward": 0.6345972418785095, "reward_std": 0.12953896820545197, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.47842535376548767, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7222222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.5301603078842163, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 437.0, "completions/min_length": 373.0, "epoch": 2.248529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00848285760730505, "kl": 0.003547934291418642, "learning_rate": 9.995296921454638e-07, "loss": 3.5555323847802356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 447.6875, "completions/min_length": 416.0, "epoch": 2.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.01457426231354475, "kl": 0.0039971956866793334, "learning_rate": 9.99524110790929e-07, "loss": 4.044389788759872e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 458.3125, "completions/min_length": 342.0, "epoch": 2.251470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2637251615524292, "kl": 0.004016303399112076, "learning_rate": 9.995184965290099e-07, "loss": 3.9760401705279946e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 448.1875, "completions/min_length": 359.0, "epoch": 2.2529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.005868075881153345, "kl": 0.0028535937308333814, "learning_rate": 9.99512849360076e-07, "loss": 2.8597483833436854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 457.25, "completions/min_length": 348.0, "epoch": 2.2544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.02587743103504181, "kl": 0.004446102189831436, "learning_rate": 9.995071692845003e-07, "loss": 4.464853191166185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 452.125, "completions/min_length": 398.0, "epoch": 2.2558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.4008175134658813, "kl": 0.021271566045470536, "learning_rate": 9.99501456302656e-07, "loss": 0.00021677464246749878, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 471.25, "completions/min_length": 413.0, "epoch": 2.2573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0848042964935303, "kl": 0.004394300747662783, "learning_rate": 9.9949571041492e-07, "loss": 4.377588629722595e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 439.875, "completions/min_length": 393.0, "epoch": 2.2588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015368190594017506, "kl": 0.004730561515316367, "learning_rate": 9.994899316216706e-07, "loss": 4.7280507715186104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 474.5, "completions/min_length": 394.0, "epoch": 2.260294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3823446035385132, "kl": 0.003578873525839299, "learning_rate": 9.994841199232886e-07, "loss": 3.586709499359131e-05, "reward": 0.6707500219345093, "reward_std": 0.4084862470626831, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4398944675922394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.670820415019989, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 443.25, "completions/min_length": 333.0, "epoch": 2.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0201917365193367, "kl": 0.004523164359852672, "learning_rate": 9.994782753201568e-07, "loss": 4.504656681092456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 455.0625, "completions/min_length": 396.0, "epoch": 2.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3027069568634033, "kl": 0.004033897828776389, "learning_rate": 9.994723978126602e-07, "loss": 4.0140002965927124e-05, "reward": 0.7719791531562805, "reward_std": 0.15476727485656738, "rewards/DrugCombAccuracyCOTORM/mean": 0.729296863079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3741573989391327, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17969882488250732, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 443.25, "completions/min_length": 379.0, "epoch": 2.264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007349156774580479, "kl": 0.003079279267694801, "learning_rate": 9.994664874011861e-07, "loss": 3.0496599720208906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 428.25, "completions/min_length": 374.0, "epoch": 2.2661764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071157305501401424, "kl": 0.0029396413592621684, "learning_rate": 9.99460544086124e-07, "loss": 2.9553575586760417e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 447.75, "completions/min_length": 406.0, "epoch": 2.2676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.2213549613952637, "kl": 0.003263559832703322, "learning_rate": 9.99454567867865e-07, "loss": 3.266980638727546e-05, "reward": 0.7302083373069763, "reward_std": 0.2515009641647339, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/mean_length": 550.0625, "completions/min_length": 437.0, "epoch": 2.2691176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2758479118347168, "kl": 0.0036466101300902665, "learning_rate": 9.99448558746803e-07, "loss": 3.662332892417908e-05, "reward": 0.41806745529174805, "reward_std": 0.1674794703722, "rewards/DrugCombAccuracyCOTORM/mean": 0.30123016238212585, "rewards/DrugCombAccuracyCOTORM/std": 0.4221840798854828, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5370530486106873, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 454.0, "completions/min_length": 377.0, "epoch": 2.2705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.2096151113510132, "kl": 0.0041726138442754745, "learning_rate": 9.99442516723334e-07, "loss": 4.193931818008423e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 449.625, "completions/min_length": 391.0, "epoch": 2.2720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0729163885116577, "kl": 0.003986645780969411, "learning_rate": 9.994364417978558e-07, "loss": 3.968924283981323e-05, "reward": 0.8250000476837158, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 457.6875, "completions/min_length": 417.0, "epoch": 2.273529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010742592625319958, "kl": 0.003886301419697702, "learning_rate": 9.994303339707687e-07, "loss": 3.887587445206009e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 512.125, "completions/min_length": 399.0, "epoch": 2.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.01292796153575182, "kl": 0.0038741338648833334, "learning_rate": 9.994241932424754e-07, "loss": 3.8548503653146327e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 488.1875, "completions/min_length": 443.0, "epoch": 2.276470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 2.605372667312622, "kl": 0.0034492318518459797, "learning_rate": 9.9941801961338e-07, "loss": 3.412365913391113e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 434.75, "completions/min_length": 398.0, "epoch": 2.277941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485542580485344, "kl": 0.003494450938887894, "learning_rate": 9.994118130838892e-07, "loss": 3.515417847665958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 475.5625, "completions/min_length": 403.0, "epoch": 2.2794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.009161917492747307, "kl": 0.0029723694315180182, "learning_rate": 9.99405573654412e-07, "loss": 2.9886234187870286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 471.0, "completions/min_length": 370.0, "epoch": 2.2808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.166788935661316, "kl": 0.004875386308412999, "learning_rate": 9.993993013253596e-07, "loss": 4.843751230509952e-05, "reward": 0.909333348274231, "reward_std": 0.07998768240213394, "rewards/DrugCombAccuracyCOTORM/mean": 0.8970833420753479, "rewards/DrugCombAccuracyCOTORM/std": 0.1626761108636856, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17213259637355804, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/mean_length": 482.0625, "completions/min_length": 336.0, "epoch": 2.2823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8435771465301514, "kl": 0.00402378651779145, "learning_rate": 9.99392996097145e-07, "loss": 4.028528928756714e-05, "reward": 0.922656238079071, "reward_std": 0.08501601964235306, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.1717960685491562, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9765625, "rewards/DrugCombCoverageCOTORM/std": 0.050389111042022705, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 469.8125, "completions/min_length": 381.0, "epoch": 2.2838235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8539547920227051, "kl": 0.0031804170575924218, "learning_rate": 9.993866579701836e-07, "loss": 3.1941897759679705e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 442.9375, "completions/min_length": 380.0, "epoch": 2.2852941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.01339790876954794, "kl": 0.0040635074255988, "learning_rate": 9.993802869448928e-07, "loss": 4.083547173650004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 473.1875, "completions/min_length": 395.0, "epoch": 2.286764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.380638837814331, "kl": 0.0036710912827402353, "learning_rate": 9.993738830216927e-07, "loss": 3.6850571632385254e-05, "reward": 0.47911110520362854, "reward_std": 0.1835905760526657, "rewards/DrugCombAccuracyCOTORM/mean": 0.44437500834465027, "rewards/DrugCombAccuracyCOTORM/std": 0.5067934393882751, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2361111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.9061972498893738, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 425.875, "completions/min_length": 334.0, "epoch": 2.288235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.2509309649467468, "kl": 0.0054124994203448296, "learning_rate": 9.99367446201005e-07, "loss": 5.358960333978757e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 477.375, "completions/min_length": 431.0, "epoch": 2.289705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1167470216751099, "kl": 0.004344743094407022, "learning_rate": 9.993609764832535e-07, "loss": 4.340290615800768e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 477.5, "completions/min_length": 425.0, "epoch": 2.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1418110132217407, "kl": 0.003610160667449236, "learning_rate": 9.993544738688646e-07, "loss": 3.631429353845306e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 438.875, "completions/min_length": 392.0, "epoch": 2.2926470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.009141440503299236, "kl": 0.003346049692481756, "learning_rate": 9.993479383582667e-07, "loss": 3.317690425319597e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 479.3125, "completions/min_length": 435.0, "epoch": 2.2941176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.190354347229004, "kl": 0.003988367272540927, "learning_rate": 9.993413699518905e-07, "loss": 3.958725574193522e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 464.0625, "completions/min_length": 375.0, "epoch": 2.2955882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.871239423751831, "kl": 0.003726984257809818, "learning_rate": 9.993347686501683e-07, "loss": 3.7550926208496094e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 464.25, "completions/min_length": 376.0, "epoch": 2.2970588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 1.0503603219985962, "kl": 0.003244302701205015, "learning_rate": 9.993281344535354e-07, "loss": 3.204107633791864e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 460.5625, "completions/min_length": 399.0, "epoch": 2.2985294117647057, "frac_reward_zero_std": 0.5, "grad_norm": 0.9479824900627136, "kl": 0.0035791125847026706, "learning_rate": 9.993214673624284e-07, "loss": 3.5687833587871864e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 481.4375, "completions/min_length": 411.0, "epoch": 2.3, "frac_reward_zero_std": 0.5, "grad_norm": 1.04146409034729, "kl": 0.003774834971409291, "learning_rate": 9.99314767377287e-07, "loss": 3.769744944293052e-05, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 472.5625, "completions/min_length": 387.0, "epoch": 2.301470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8711494207382202, "kl": 0.003813423914834857, "learning_rate": 9.99308034498552e-07, "loss": 3.853440284729004e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 452.5, "completions/min_length": 379.0, "epoch": 2.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2857517004013062, "kl": 0.003212360490579158, "learning_rate": 9.993012687266675e-07, "loss": 3.215670585632324e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 469.9375, "completions/min_length": 418.0, "epoch": 2.304411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008371789008378983, "kl": 0.003802501072641462, "learning_rate": 9.992944700620792e-07, "loss": 3.802966239163652e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 479.375, "completions/min_length": 396.0, "epoch": 2.3058823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 1.8292137384414673, "kl": 0.005419908673502505, "learning_rate": 9.992876385052344e-07, "loss": 5.605816841125488e-05, "reward": 0.2875000238418579, "reward_std": 0.32480600476264954, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.35939764976501465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 536.9375, "completions/min_length": 467.0, "epoch": 2.307352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1537858247756958, "kl": 0.002533645776566118, "learning_rate": 9.992807740565835e-07, "loss": 2.5250017642974854e-05, "reward": 0.5833333134651184, "reward_std": 0.035634834319353104, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 488.0625, "completions/min_length": 428.0, "epoch": 2.3088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0358203649520874, "kl": 0.0037799422279931605, "learning_rate": 9.99273876716579e-07, "loss": 3.7768855690956116e-05, "reward": 0.6807470321655273, "reward_std": 0.1631864607334137, "rewards/DrugCombAccuracyCOTORM/mean": 0.6080952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.49096694588661194, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9427083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.12441994994878769, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 453.625, "completions/min_length": 348.0, "epoch": 2.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03240545094013214, "kl": 0.003539079858455807, "learning_rate": 9.99266946485675e-07, "loss": 3.542995546013117e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 416.8125, "completions/min_length": 355.0, "epoch": 2.3117647058823527, "frac_reward_zero_std": 1.0, "grad_norm": 0.009250749833881855, "kl": 0.003684523224364966, "learning_rate": 9.992599833643278e-07, "loss": 3.685925548779778e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 475.0, "completions/min_length": 396.0, "epoch": 2.3132352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.012322126887738705, "kl": 0.004482711374294013, "learning_rate": 9.992529873529965e-07, "loss": 4.5325850805966184e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 441.625, "completions/min_length": 355.0, "epoch": 2.314705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0919950008392334, "kl": 0.004562878108117729, "learning_rate": 9.99245958452142e-07, "loss": 4.571676254272461e-05, "reward": 0.6180000305175781, "reward_std": 0.02074180170893669, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4515184462070465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 447.5625, "completions/min_length": 401.0, "epoch": 2.3161764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.0442259311676025, "kl": 0.004952918679919094, "learning_rate": 9.992388966622268e-07, "loss": 4.952332528773695e-05, "reward": 0.8374999761581421, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 490.0625, "completions/min_length": 405.0, "epoch": 2.317647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.007375370245426893, "kl": 0.0032775462605059147, "learning_rate": 9.99231801983717e-07, "loss": 3.277562063885853e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 454.3125, "completions/min_length": 399.0, "epoch": 2.3191176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.009782230481505394, "kl": 0.0036148756626062095, "learning_rate": 9.99224674417079e-07, "loss": 3.583424040698446e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 447.625, "completions/min_length": 378.0, "epoch": 2.320588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4006670713424683, "kl": 0.0038525605923496187, "learning_rate": 9.992175139627832e-07, "loss": 3.857165575027466e-05, "reward": 0.7875000238418579, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 444.375, "completions/min_length": 396.0, "epoch": 2.322058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.6181291341781616, "kl": 0.004460421099793166, "learning_rate": 9.992103206213008e-07, "loss": 4.435284063220024e-05, "reward": 0.6687500476837158, "reward_std": 0.20517849922180176, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 458.875, "completions/min_length": 420.0, "epoch": 2.323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007566997315734625, "kl": 0.0032629233901388943, "learning_rate": 9.992030943931058e-07, "loss": 3.256147829233669e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 440.4375, "completions/min_length": 373.0, "epoch": 2.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.010638381354510784, "kl": 0.0031979733030311763, "learning_rate": 9.991958352786744e-07, "loss": 3.170713898725808e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 479.0, "completions/min_length": 437.0, "epoch": 2.3264705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749458432197571, "kl": 0.0039039516705088317, "learning_rate": 9.991885432784845e-07, "loss": 3.922657197108492e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 408.6875, "completions/min_length": 370.0, "epoch": 2.3279411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059517608024179935, "kl": 0.002870387746952474, "learning_rate": 9.991812183930169e-07, "loss": 2.8919348551426083e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 439.0, "completions/min_length": 396.0, "epoch": 2.3294117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076521229930222034, "kl": 0.0027363859699107707, "learning_rate": 9.991738606227536e-07, "loss": 2.735965244937688e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 493.6875, "completions/min_length": 432.0, "epoch": 2.3308823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.4204394817352295, "kl": 0.0038074488984420896, "learning_rate": 9.991664699681798e-07, "loss": 3.834813833236694e-05, "reward": 0.8675416707992554, "reward_std": 0.20170412957668304, "rewards/DrugCombAccuracyCOTORM/mean": 0.8409374952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.30590832233428955, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.11334558576345444, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 448.5625, "completions/min_length": 364.0, "epoch": 2.3323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.3905296325683594, "kl": 0.003755067358724773, "learning_rate": 9.991590464297822e-07, "loss": 3.737211227416992e-05, "reward": 0.6499999761581421, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 434.5625, "completions/min_length": 380.0, "epoch": 2.333823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015008160844445229, "kl": 0.004038603859953582, "learning_rate": 9.991515900080496e-07, "loss": 3.9855622162576765e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 487.75, "completions/min_length": 433.0, "epoch": 2.335294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9301602840423584, "kl": 0.00678811885882169, "learning_rate": 9.991441007034738e-07, "loss": 6.62497550365515e-05, "reward": 0.925208330154419, "reward_std": 0.0641985684633255, "rewards/DrugCombAccuracyCOTORM/mean": 0.9208333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.12127409875392914, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.29007503390312195, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 493.5, "completions/min_length": 412.0, "epoch": 2.336764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.9036263227462769, "kl": 0.003694346349220723, "learning_rate": 9.991365785165476e-07, "loss": 3.698468208312988e-05, "reward": 0.45625001192092896, "reward_std": 0.3661186993122101, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 469.9375, "completions/min_length": 403.0, "epoch": 2.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006770751439034939, "kl": 0.0029246347839944065, "learning_rate": 9.991290234477666e-07, "loss": 2.9356498998822644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 474.75, "completions/min_length": 423.0, "epoch": 2.3397058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.006249665282666683, "kl": 0.003150977601762861, "learning_rate": 9.99121435497629e-07, "loss": 3.179747363901697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.3125, "completions/min_length": 400.0, "epoch": 2.3411764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011150944977998734, "kl": 0.004111309128347784, "learning_rate": 9.991138146666342e-07, "loss": 4.152500332565978e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 488.8125, "completions/min_length": 433.0, "epoch": 2.3426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010409919545054436, "kl": 0.003945799602661282, "learning_rate": 9.991061609552844e-07, "loss": 3.9472644857596606e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/mean_length": 534.125, "completions/min_length": 436.0, "epoch": 2.3441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.5257623195648193, "kl": 0.0041543382103554904, "learning_rate": 9.990984743640838e-07, "loss": 4.1350722312927246e-05, "reward": 0.6499999761581421, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 454.0625, "completions/min_length": 389.0, "epoch": 2.3455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.01104989554733038, "kl": 0.0035321558243595064, "learning_rate": 9.990907548935387e-07, "loss": 3.5107015719404444e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 520.0625, "completions/min_length": 464.0, "epoch": 2.347058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.1935851573944092, "kl": 0.0036975782713852823, "learning_rate": 9.99083002544158e-07, "loss": 3.688037395477295e-05, "reward": 0.910812497138977, "reward_std": 0.25226032733917236, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 414.4375, "completions/min_length": 369.0, "epoch": 2.348529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.006737248040735722, "kl": 0.0030124917975626886, "learning_rate": 9.990752173164518e-07, "loss": 3.0027884349692613e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 449.9375, "completions/min_length": 408.0, "epoch": 2.35, "frac_reward_zero_std": 0.5, "grad_norm": 1.07859468460083, "kl": 0.0037346130702644587, "learning_rate": 9.990673992109333e-07, "loss": 3.7357211112976074e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 423.3125, "completions/min_length": 384.0, "epoch": 2.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01019195280969143, "kl": 0.00391647923970595, "learning_rate": 9.990595482281177e-07, "loss": 3.881985321640968e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 469.625, "completions/min_length": 384.0, "epoch": 2.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.008342931978404522, "kl": 0.003818161436356604, "learning_rate": 9.990516643685221e-07, "loss": 3.813526200246997e-05, "reward": 0.7016666531562805, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.637499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3743883967399597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 450.875, "completions/min_length": 401.0, "epoch": 2.3544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009854558855295181, "kl": 0.0036473138025030494, "learning_rate": 9.990437476326655e-07, "loss": 3.6324017855804414e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 420.375, "completions/min_length": 359.0, "epoch": 2.3558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8923723697662354, "kl": 0.003208399110008031, "learning_rate": 9.990357980210699e-07, "loss": 3.1663472327636555e-05, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 476.25, "completions/min_length": 370.0, "epoch": 2.3573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.1413805484771729, "kl": 0.003925500204786658, "learning_rate": 9.99027815534259e-07, "loss": 3.928286969312467e-05, "reward": 0.6568333506584167, "reward_std": 0.1374792605638504, "rewards/DrugCombAccuracyCOTORM/mean": 0.597083330154419, "rewards/DrugCombAccuracyCOTORM/std": 0.46426665782928467, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 485.1875, "completions/min_length": 415.0, "epoch": 2.3588235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3600389957427979, "kl": 0.0036984969628974795, "learning_rate": 9.99019800172758e-07, "loss": 3.7297606468200684e-05, "reward": 0.7041666507720947, "reward_std": 0.3044792115688324, "rewards/DrugCombAccuracyCOTORM/mean": 0.6354166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42695629596710205, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 444.5, "completions/min_length": 381.0, "epoch": 2.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.2367439270019531, "kl": 0.004804784606676549, "learning_rate": 9.990117519370957e-07, "loss": 4.733726382255554e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 484.0, "completions/min_length": 410.0, "epoch": 2.361764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4953763484954834, "kl": 0.00346431732759811, "learning_rate": 9.990036708278022e-07, "loss": 3.481656312942505e-05, "reward": 0.9140416383743286, "reward_std": 0.24312688410282135, "rewards/DrugCombAccuracyCOTORM/mean": 0.8990625143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.27599647641181946, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 468.9375, "completions/min_length": 393.0, "epoch": 2.363235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009733648039400578, "kl": 0.0032710115774534643, "learning_rate": 9.989955568454095e-07, "loss": 3.2664378522895277e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 467.4375, "completions/min_length": 373.0, "epoch": 2.364705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8724040389060974, "kl": 0.0036245413357391953, "learning_rate": 9.989874099904522e-07, "loss": 3.621727228164673e-05, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 402.5, "completions/min_length": 372.0, "epoch": 2.3661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8973472118377686, "kl": 0.004823952971491963, "learning_rate": 9.989792302634674e-07, "loss": 4.809349775314331e-05, "reward": 0.6451666355133057, "reward_std": 0.28361278772354126, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419354915619, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 421.375, "completions/min_length": 386.0, "epoch": 2.3676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.019030001014471054, "kl": 0.004563732829410583, "learning_rate": 9.989710176649934e-07, "loss": 4.531783997663297e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 456.5, "completions/min_length": 417.0, "epoch": 2.3691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9286018013954163, "kl": 0.003316233167424798, "learning_rate": 9.989627721955716e-07, "loss": 3.3323765819659457e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 479.4375, "completions/min_length": 435.0, "epoch": 2.3705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.088197112083435, "kl": 0.004375846066977829, "learning_rate": 9.989544938557452e-07, "loss": 4.4018030166625977e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 440.625, "completions/min_length": 353.0, "epoch": 2.3720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.006394032388925552, "kl": 0.0033965623006224632, "learning_rate": 9.989461826460592e-07, "loss": 3.439516149228439e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 405.1875, "completions/min_length": 362.0, "epoch": 2.373529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015695316717028618, "kl": 0.00505920744035393, "learning_rate": 9.989378385670616e-07, "loss": 5.150598371983506e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 463.125, "completions/min_length": 378.0, "epoch": 2.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.014764068648219109, "kl": 0.0035745701170526445, "learning_rate": 9.989294616193017e-07, "loss": 3.5790180845651776e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 489.0, "completions/min_length": 439.0, "epoch": 2.376470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8338860869407654, "kl": 0.004720003984402865, "learning_rate": 9.989210518033314e-07, "loss": 4.7093970351852477e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 503.5, "completions/min_length": 410.0, "epoch": 2.3779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7371793985366821, "kl": 0.003487207170110196, "learning_rate": 9.98912609119705e-07, "loss": 3.4809112548828125e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 479.6875, "completions/min_length": 428.0, "epoch": 2.3794117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 2.161802053451538, "kl": 0.00397881621029228, "learning_rate": 9.989041335689786e-07, "loss": 3.9793550968170166e-05, "reward": 0.6910417079925537, "reward_std": 0.3481196463108063, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4121982753276825, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7437499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.2657536566257477, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 439.6875, "completions/min_length": 386.0, "epoch": 2.3808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.005530083552002907, "kl": 0.0030234031146392226, "learning_rate": 9.988956251517104e-07, "loss": 3.0304614483611658e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 430.0625, "completions/min_length": 368.0, "epoch": 2.3823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01076787430793047, "kl": 0.0037785492022521794, "learning_rate": 9.98887083868461e-07, "loss": 3.750305768335238e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 428.3125, "completions/min_length": 355.0, "epoch": 2.3838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0793613195419312, "kl": 0.0042764111422002316, "learning_rate": 9.988785097197927e-07, "loss": 4.2724299419205636e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 469.3125, "completions/min_length": 393.0, "epoch": 2.385294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011200669221580029, "kl": 0.004153824062086642, "learning_rate": 9.988699027062709e-07, "loss": 4.14535534218885e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 467.9375, "completions/min_length": 422.0, "epoch": 2.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8685263395309448, "kl": 0.003850704408250749, "learning_rate": 9.988612628284626e-07, "loss": 3.853440284729004e-05, "reward": 0.8822916746139526, "reward_std": 0.07923711836338043, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.1666666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.49241939187049866, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 514.3125, "completions/min_length": 415.0, "epoch": 2.388235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3159000873565674, "kl": 0.0044490807922557, "learning_rate": 9.988525900869365e-07, "loss": 4.442036151885986e-05, "reward": 0.7817916870117188, "reward_std": 0.3201712369918823, "rewards/DrugCombAccuracyCOTORM/mean": 0.7441666722297668, "rewards/DrugCombAccuracyCOTORM/std": 0.39333614706993103, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.2803354561328888, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 447.625, "completions/min_length": 379.0, "epoch": 2.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.021420398727059364, "kl": 0.0053391383262351155, "learning_rate": 9.988438844822642e-07, "loss": 5.3141291573410854e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/mean_length": 503.9375, "completions/min_length": 423.0, "epoch": 2.3911764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.2840598821640015, "kl": 0.003415203944314271, "learning_rate": 9.988351460150193e-07, "loss": 3.49096953868866e-05, "reward": 0.5901666879653931, "reward_std": 0.19087685644626617, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8595865368843079, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 452.0, "completions/min_length": 373.0, "epoch": 2.3926470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.4987523555755615, "kl": 0.0033378638327121735, "learning_rate": 9.988263746857772e-07, "loss": 3.319978713989258e-05, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.1875, "completions/min_length": 386.0, "epoch": 2.3941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9224516749382019, "kl": 0.004249455116223544, "learning_rate": 9.988175704951162e-07, "loss": 4.2287749238312244e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 416.25, "completions/min_length": 354.0, "epoch": 2.3955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012173500843346119, "kl": 0.00429917371366173, "learning_rate": 9.988087334436158e-07, "loss": 4.284373426344246e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 430.25, "completions/min_length": 357.0, "epoch": 2.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009755766950547695, "kl": 0.003319297160487622, "learning_rate": 9.987998635318584e-07, "loss": 3.318745439173654e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 519.0, "completions/min_length": 455.0, "epoch": 2.398529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.5662082433700562, "kl": 0.004422753001563251, "learning_rate": 9.987909607604284e-07, "loss": 4.436075687408447e-05, "reward": 0.45819446444511414, "reward_std": 0.37359893321990967, "rewards/DrugCombAccuracyCOTORM/mean": 0.4000000059604645, "rewards/DrugCombAccuracyCOTORM/std": 0.41633322834968567, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3819444477558136, "rewards/DrugCombCoverageCOTORM/std": 0.6690928339958191, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 499.9375, "completions/min_length": 425.0, "epoch": 2.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.8715343475341797, "kl": 0.0034111483837477863, "learning_rate": 9.98782025129912e-07, "loss": 3.3995653211604804e-05, "reward": 0.8453124761581421, "reward_std": 0.21348902583122253, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 465.0, "completions/min_length": 367.0, "epoch": 2.401470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8944588303565979, "kl": 0.004532657912932336, "learning_rate": 9.987730566408983e-07, "loss": 4.573911428451538e-05, "reward": 0.7106666564941406, "reward_std": 0.1936037540435791, "rewards/DrugCombAccuracyCOTORM/mean": 0.6800000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4316789209842682, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6666666865348816, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 434.25, "completions/min_length": 381.0, "epoch": 2.402941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02081987075507641, "kl": 0.0037853732355870306, "learning_rate": 9.987640552939776e-07, "loss": 3.717838626471348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 433.25, "completions/min_length": 380.0, "epoch": 2.4044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.010744133964180946, "kl": 0.003433407284319401, "learning_rate": 9.987550210897432e-07, "loss": 3.436009137658402e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 477.5625, "completions/min_length": 425.0, "epoch": 2.4058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9907541275024414, "kl": 0.0031089260592125356, "learning_rate": 9.987459540287903e-07, "loss": 3.100948015344329e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 460.6875, "completions/min_length": 400.0, "epoch": 2.4073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7919429540634155, "kl": 0.003544237930327654, "learning_rate": 9.987368541117162e-07, "loss": 3.54573130607605e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 452.125, "completions/min_length": 378.0, "epoch": 2.4088235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.9940578937530518, "kl": 0.004645655513741076, "learning_rate": 9.987277213391202e-07, "loss": 4.6321627451106906e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 478.5625, "completions/min_length": 391.0, "epoch": 2.4102941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 2.756023645401001, "kl": 0.005088526231702417, "learning_rate": 9.98718555711604e-07, "loss": 5.066260200692341e-05, "reward": 0.5545499920845032, "reward_std": 0.012869343161582947, "rewards/DrugCombAccuracyCOTORM/mean": 0.5041249990463257, "rewards/DrugCombAccuracyCOTORM/std": 0.5123854875564575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.512499988079071, "rewards/DrugCombCoverageCOTORM/std": 0.5057997107505798, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 419.6875, "completions/min_length": 392.0, "epoch": 2.411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009621541015803814, "kl": 0.004395535390358418, "learning_rate": 9.987093572297715e-07, "loss": 4.355331839178689e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 491.625, "completions/min_length": 436.0, "epoch": 2.413235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.23709237575531, "kl": 0.005161843495443463, "learning_rate": 9.987001258942288e-07, "loss": 5.130469799041748e-05, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 527.0625, "completions/min_length": 488.0, "epoch": 2.414705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.028581304475665092, "kl": 0.00523324852110818, "learning_rate": 9.986908617055834e-07, "loss": 5.304298611008562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 559.875, "completions/min_length": 460.0, "epoch": 2.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1815736293792725, "kl": 0.00444505549967289, "learning_rate": 9.986815646644464e-07, "loss": 4.484504461288452e-05, "reward": 0.8195832967758179, "reward_std": 0.1976885348558426, "rewards/DrugCombAccuracyCOTORM/mean": 0.8187500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.34683093428611755, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6827911138534546, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 463.375, "completions/min_length": 400.0, "epoch": 2.4176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.3931831121444702, "kl": 0.003844756633043289, "learning_rate": 9.9867223477143e-07, "loss": 3.83816659450531e-05, "reward": 0.6000000238418579, "reward_std": 0.3618106245994568, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 443.0, "completions/min_length": 409.0, "epoch": 2.4191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.00745248980820179, "kl": 0.0033249904518015683, "learning_rate": 9.986628720271485e-07, "loss": 3.311917680548504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 501.3125, "completions/min_length": 413.0, "epoch": 2.4205882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9857500791549683, "kl": 0.004720754572190344, "learning_rate": 9.98653476432219e-07, "loss": 4.87305223941803e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 448.25, "completions/min_length": 363.0, "epoch": 2.4220588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 394.4491271972656, "kl": 0.5914249592460692, "learning_rate": 9.986440479872604e-07, "loss": 0.0068100676871836185, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 442.0, "completions/min_length": 369.0, "epoch": 2.4235294117647057, "frac_reward_zero_std": 0.5, "grad_norm": 1.045365810394287, "kl": 0.003521005797665566, "learning_rate": 9.98634586692894e-07, "loss": 3.542008198564872e-05, "reward": 0.7562500238418579, "reward_std": 0.2610931694507599, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 461.4375, "completions/min_length": 424.0, "epoch": 2.425, "frac_reward_zero_std": 1.0, "grad_norm": 0.007346952799707651, "kl": 0.003389576217159629, "learning_rate": 9.986250925497428e-07, "loss": 3.386128810234368e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 418.0, "completions/min_length": 352.0, "epoch": 2.426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014785043895244598, "kl": 0.004890708369202912, "learning_rate": 9.986155655584324e-07, "loss": 4.849032120546326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 460.1875, "completions/min_length": 367.0, "epoch": 2.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0496948957443237, "kl": 0.003604383731726557, "learning_rate": 9.986060057195902e-07, "loss": 3.6226832889951766e-05, "reward": 0.9437500238418579, "reward_std": 0.10415476560592651, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 461.1875, "completions/min_length": 389.0, "epoch": 2.429411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007503276225179434, "kl": 0.0027713340241461992, "learning_rate": 9.985964130338465e-07, "loss": 2.7650437914417125e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 450.5625, "completions/min_length": 384.0, "epoch": 2.4308823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.012335380539298058, "kl": 0.003748876159079373, "learning_rate": 9.985867875018326e-07, "loss": 3.7513585994020104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 481.875, "completions/min_length": 422.0, "epoch": 2.432352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3729873895645142, "kl": 0.005960763548500836, "learning_rate": 9.98577129124183e-07, "loss": 6.007550109643489e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 511.75, "completions/min_length": 400.0, "epoch": 2.4338235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 1.4619667530059814, "kl": 0.003950849873945117, "learning_rate": 9.985674379015338e-07, "loss": 4.009902477264404e-05, "reward": 0.75, "reward_std": 0.35523033142089844, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 421.0625, "completions/min_length": 396.0, "epoch": 2.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.007344153709709644, "kl": 0.0030095691909082234, "learning_rate": 9.985577138345238e-07, "loss": 3.007285704370588e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 500.625, "completions/min_length": 439.0, "epoch": 2.4367647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.5160459280014038, "kl": 0.006099842605181038, "learning_rate": 9.98547956923793e-07, "loss": 6.135553121566772e-05, "reward": 0.6499999761581421, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 421.875, "completions/min_length": 361.0, "epoch": 2.4382352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.010395562276244164, "kl": 0.0037111223209649324, "learning_rate": 9.985381671699846e-07, "loss": 3.698697764775716e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 411.875, "completions/min_length": 347.0, "epoch": 2.439705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.6789003610610962, "kl": 0.00437262695049867, "learning_rate": 9.985283445737434e-07, "loss": 4.521757364273071e-05, "reward": 0.21250000596046448, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 438.625, "completions/min_length": 419.0, "epoch": 2.4411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.079737663269043, "kl": 0.004943665291648358, "learning_rate": 9.985184891357164e-07, "loss": 5.0182472477899864e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 435.8125, "completions/min_length": 347.0, "epoch": 2.442647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.03560442477464676, "kl": 0.0034379325807094574, "learning_rate": 9.985086008565529e-07, "loss": 3.454151737969369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 483.5, "completions/min_length": 449.0, "epoch": 2.4441176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 0.9300119876861572, "kl": 0.003164128225762397, "learning_rate": 9.984986797369043e-07, "loss": 3.182142972946167e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 448.0625, "completions/min_length": 361.0, "epoch": 2.445588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01506397407501936, "kl": 0.004622807144187391, "learning_rate": 9.984887257774245e-07, "loss": 4.6051220124354586e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 471.375, "completions/min_length": 330.0, "epoch": 2.447058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.019402511417865753, "kl": 0.004511935927439481, "learning_rate": 9.984787389787688e-07, "loss": 4.5254073484102264e-05, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.12909944355487823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 421.25, "completions/min_length": 331.0, "epoch": 2.448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.175638198852539, "kl": 0.004110701731406152, "learning_rate": 9.984687193415952e-07, "loss": 4.1315059206681326e-05, "reward": 0.16250000894069672, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.375, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 482.0625, "completions/min_length": 397.0, "epoch": 2.45, "frac_reward_zero_std": 0.5, "grad_norm": 0.986763060092926, "kl": 0.0037047655787318945, "learning_rate": 9.98458666866564e-07, "loss": 3.684167313622311e-05, "reward": 0.9354166984558105, "reward_std": 0.09005619585514069, "rewards/DrugCombAccuracyCOTORM/mean": 0.9270833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.16065548360347748, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 426.5625, "completions/min_length": 379.0, "epoch": 2.4514705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 1.021036982536316, "kl": 0.003521260106936097, "learning_rate": 9.98448581554337e-07, "loss": 3.5366174415685236e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 456.375, "completions/min_length": 415.0, "epoch": 2.4529411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.04186692461371422, "kl": 0.005443534755613655, "learning_rate": 9.984384634055792e-07, "loss": 5.448472802527249e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.4375, "completions/min_length": 384.0, "epoch": 2.4544117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.016164736822247505, "kl": 0.004157147486694157, "learning_rate": 9.984283124209566e-07, "loss": 4.158496085437946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 467.625, "completions/min_length": 412.0, "epoch": 2.4558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8911771774291992, "kl": 0.004389154375530779, "learning_rate": 9.984181286011382e-07, "loss": 4.388391971588135e-05, "reward": 0.9604166746139526, "reward_std": 0.07329408079385757, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 496.5, "completions/min_length": 423.0, "epoch": 2.4573529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.3563963174819946, "kl": 0.0028101725038141012, "learning_rate": 9.984079119467948e-07, "loss": 2.7880072593688965e-05, "reward": 0.37958335876464844, "reward_std": 0.3066340684890747, "rewards/DrugCombAccuracyCOTORM/mean": 0.22708334028720856, "rewards/DrugCombAccuracyCOTORM/std": 0.38516542315483093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 436.4375, "completions/min_length": 374.0, "epoch": 2.458823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.452046275138855, "kl": 0.0036175717832520604, "learning_rate": 9.983976624585996e-07, "loss": 3.631412982940674e-05, "reward": 0.5249999761581421, "reward_std": 0.4475547969341278, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 447.3125, "completions/min_length": 377.0, "epoch": 2.460294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.007121759001165628, "kl": 0.0032529454911127687, "learning_rate": 9.983873801372274e-07, "loss": 3.262728569097817e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 436.4375, "completions/min_length": 375.0, "epoch": 2.461764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015625156462192535, "kl": 0.004078480415046215, "learning_rate": 9.98377064983356e-07, "loss": 4.098113640793599e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 438.125, "completions/min_length": 385.0, "epoch": 2.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01157910842448473, "kl": 0.00400342681678012, "learning_rate": 9.983667169976649e-07, "loss": 4.028478724649176e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 452.125, "completions/min_length": 408.0, "epoch": 2.4647058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.2477189302444458, "kl": 0.00433814205462113, "learning_rate": 9.983563361808357e-07, "loss": 4.353341137175448e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 473.6875, "completions/min_length": 401.0, "epoch": 2.4661764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.6814929246902466, "kl": 0.004413575632497668, "learning_rate": 9.983459225335522e-07, "loss": 4.4286251068115234e-05, "reward": 0.5562499761581421, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 477.4375, "completions/min_length": 425.0, "epoch": 2.4676470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4512639045715332, "kl": 0.0035752771655097604, "learning_rate": 9.983354760565005e-07, "loss": 3.5274773836135864e-05, "reward": 0.53125, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 513.875, "completions/min_length": 427.0, "epoch": 2.4691176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4935170412063599, "kl": 0.0039837423246353865, "learning_rate": 9.983249967503686e-07, "loss": 3.870576620101929e-05, "reward": 0.8177083134651184, "reward_std": 0.3590848445892334, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 461.3125, "completions/min_length": 408.0, "epoch": 2.4705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.019536152482032776, "kl": 0.005060689116362482, "learning_rate": 9.98314484615847e-07, "loss": 5.0758550059981644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 505.75, "completions/min_length": 432.0, "epoch": 2.472058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009901217184960842, "kl": 0.0036139609292149544, "learning_rate": 9.983039396536287e-07, "loss": 3.60799822374247e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 460.375, "completions/min_length": 409.0, "epoch": 2.473529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01029470469802618, "kl": 0.004189266706816852, "learning_rate": 9.982933618644073e-07, "loss": 4.2073719669133425e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 453.9375, "completions/min_length": 402.0, "epoch": 2.475, "frac_reward_zero_std": 1.0, "grad_norm": 0.007850782945752144, "kl": 0.0034239673987030983, "learning_rate": 9.982827512488808e-07, "loss": 3.410570207051933e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 418.6875, "completions/min_length": 369.0, "epoch": 2.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00855038408190012, "kl": 0.003060207818634808, "learning_rate": 9.982721078077471e-07, "loss": 3.058178845094517e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 452.9375, "completions/min_length": 363.0, "epoch": 2.4779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0143396854400635, "kl": 0.003148556628730148, "learning_rate": 9.982614315417083e-07, "loss": 3.1478703022003174e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 456.3125, "completions/min_length": 399.0, "epoch": 2.4794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01774313673377037, "kl": 0.00419424258871004, "learning_rate": 9.98250722451467e-07, "loss": 4.192340566078201e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 488.625, "completions/min_length": 419.0, "epoch": 2.4808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8729391098022461, "kl": 0.004443822079338133, "learning_rate": 9.982399805377296e-07, "loss": 4.398822784423828e-05, "reward": 0.8168541193008423, "reward_std": 0.1631818413734436, "rewards/DrugCombAccuracyCOTORM/mean": 0.7814843654632568, "rewards/DrugCombAccuracyCOTORM/std": 0.34748417139053345, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 404.3125, "completions/min_length": 322.0, "epoch": 2.4823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9707460999488831, "kl": 0.004450141277629882, "learning_rate": 9.982292058012025e-07, "loss": 4.4114887714385986e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 447.625, "completions/min_length": 394.0, "epoch": 2.4838235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.2916566133499146, "kl": 0.00599446299020201, "learning_rate": 9.982183982425967e-07, "loss": 5.99399209022522e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 420.5625, "completions/min_length": 375.0, "epoch": 2.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012476721778512001, "kl": 0.004250301048159599, "learning_rate": 9.982075578626232e-07, "loss": 4.2561259760987014e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 459.375, "completions/min_length": 402.0, "epoch": 2.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.12313081324100494, "kl": 0.005972809682134539, "learning_rate": 9.98196684661997e-07, "loss": 6.090258830226958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 446.8125, "completions/min_length": 381.0, "epoch": 2.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006039934232831001, "kl": 0.0029639191343449056, "learning_rate": 9.981857786414338e-07, "loss": 2.9543380151153542e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 495.875, "completions/min_length": 425.0, "epoch": 2.489705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9673230648040771, "kl": 0.004060895764268935, "learning_rate": 9.981748398016522e-07, "loss": 4.0607526898384094e-05, "reward": 0.652999997138977, "reward_std": 0.03111269511282444, "rewards/DrugCombAccuracyCOTORM/mean": 0.5870833396911621, "rewards/DrugCombAccuracyCOTORM/std": 0.42975595593452454, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/mean_length": 560.3125, "completions/min_length": 430.0, "epoch": 2.4911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7736046314239502, "kl": 0.0035530278692021966, "learning_rate": 9.98163868143373e-07, "loss": 3.5740435123443604e-05, "reward": 0.7186111211776733, "reward_std": 0.048234257847070694, "rewards/DrugCombAccuracyCOTORM/mean": 0.6499999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.3711842894554138, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9861111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.0555555522441864, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 467.875, "completions/min_length": 408.0, "epoch": 2.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.13257808983325958, "kl": 0.0046648632269352674, "learning_rate": 9.981528636673189e-07, "loss": 4.547792923403904e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 430.75, "completions/min_length": 379.0, "epoch": 2.4941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009614173322916031, "kl": 0.003598364011850208, "learning_rate": 9.981418263742148e-07, "loss": 3.602752985898405e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 496.75, "completions/min_length": 442.0, "epoch": 2.4955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0482934713363647, "kl": 0.0041555846109986305, "learning_rate": 9.981307562647876e-07, "loss": 4.167325096204877e-05, "reward": 0.5713333487510681, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.48500001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.4182184636592865, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 446.5625, "completions/min_length": 405.0, "epoch": 2.4970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0892910957336426, "kl": 0.0037583623779937625, "learning_rate": 9.98119653339767e-07, "loss": 3.7691250327043235e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 469.75, "completions/min_length": 409.0, "epoch": 2.498529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.9533227682113647, "kl": 0.004359433311037719, "learning_rate": 9.98108517599884e-07, "loss": 4.364843698567711e-05, "reward": 0.18124999105930328, "reward_std": 0.1791597455739975, "rewards/DrugCombAccuracyCOTORM/mean": 0.09375, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 389.75, "completions/min_length": 358.0, "epoch": 2.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007364716846495867, "kl": 0.0028078494069632143, "learning_rate": 9.980973490458728e-07, "loss": 2.8301139536779374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 454.0, "completions/min_length": 394.0, "epoch": 2.501470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1352651119232178, "kl": 0.003989504009950906, "learning_rate": 9.980861476784686e-07, "loss": 3.9830803871154785e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/mean_length": 534.25, "completions/min_length": 414.0, "epoch": 2.5029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008561474271118641, "kl": 0.0035089017474092543, "learning_rate": 9.980749134984092e-07, "loss": 3.474981713225134e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 453.0625, "completions/min_length": 377.0, "epoch": 2.5044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.00885007344186306, "kl": 0.003797178389504552, "learning_rate": 9.980636465064354e-07, "loss": 3.8282676541712135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 529.625, "completions/min_length": 442.0, "epoch": 2.5058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9347149133682251, "kl": 0.004484927514567971, "learning_rate": 9.980523467032887e-07, "loss": 4.492163498071022e-05, "reward": 0.7056547403335571, "reward_std": 0.160870760679245, "rewards/DrugCombAccuracyCOTORM/mean": 0.6607142686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.4223087430000305, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5266069173812866, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 434.9375, "completions/min_length": 383.0, "epoch": 2.5073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.230976939201355, "kl": 0.003957560926210135, "learning_rate": 9.98041014089714e-07, "loss": 3.941619797842577e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 454.3125, "completions/min_length": 412.0, "epoch": 2.5088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0779945850372314, "kl": 0.0044120075181126595, "learning_rate": 9.980296486664576e-07, "loss": 4.4224580051377416e-05, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 444.875, "completions/min_length": 402.0, "epoch": 2.510294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01331617683172226, "kl": 0.003701700596138835, "learning_rate": 9.980182504342682e-07, "loss": 3.6865945730824023e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 435.3125, "completions/min_length": 394.0, "epoch": 2.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064706564880907536, "kl": 0.00312754133483395, "learning_rate": 9.980068193938969e-07, "loss": 3.094313433393836e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 435.875, "completions/min_length": 397.0, "epoch": 2.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1344068050384521, "kl": 0.0034255421487614512, "learning_rate": 9.979953555460968e-07, "loss": 3.4065415093209594e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 491.25, "completions/min_length": 444.0, "epoch": 2.514705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.5478038787841797, "kl": 0.00391081563429907, "learning_rate": 9.979838588916228e-07, "loss": 3.9167702198028564e-05, "reward": 0.737500011920929, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 485.5, "completions/min_length": 422.0, "epoch": 2.5161764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9832400679588318, "kl": 0.003870766144245863, "learning_rate": 9.979723294312324e-07, "loss": 3.8288944779196754e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 483.875, "completions/min_length": 404.0, "epoch": 2.5176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.020119214430451393, "kl": 0.0049858951242640615, "learning_rate": 9.97960767165685e-07, "loss": 4.942130544804968e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 422.1875, "completions/min_length": 334.0, "epoch": 2.5191176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9480982422828674, "kl": 0.0032856083125807345, "learning_rate": 9.979491720957425e-07, "loss": 3.284366903244518e-05, "reward": 0.6187499761581421, "reward_std": 0.15797266364097595, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 513.8125, "completions/min_length": 426.0, "epoch": 2.5205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1770434379577637, "kl": 0.0036366847925819457, "learning_rate": 9.97937544222169e-07, "loss": 3.6522746086120605e-05, "reward": 0.2802083492279053, "reward_std": 0.2515009641647339, "rewards/DrugCombAccuracyCOTORM/mean": 0.2291666716337204, "rewards/DrugCombAccuracyCOTORM/std": 0.4166666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.03125, "rewards/DrugCombCoverageCOTORM/std": 0.6944722533226013, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 479.25, "completions/min_length": 418.0, "epoch": 2.5220588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3235212564468384, "kl": 0.0035099555971100926, "learning_rate": 9.979258835457298e-07, "loss": 3.503262996673584e-05, "reward": 0.8354166746139526, "reward_std": 0.27522021532058716, "rewards/DrugCombAccuracyCOTORM/mean": 0.8020833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.35075974464416504, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 418.3125, "completions/min_length": 361.0, "epoch": 2.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01614062488079071, "kl": 0.003562816884368658, "learning_rate": 9.979141900671937e-07, "loss": 3.517538061714731e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 393.9375, "completions/min_length": 343.0, "epoch": 2.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.005961697082966566, "kl": 0.0034124667872674763, "learning_rate": 9.979024637873308e-07, "loss": 3.4337812394369394e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 466.1875, "completions/min_length": 380.0, "epoch": 2.526470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3108141422271729, "kl": 0.008373683725949377, "learning_rate": 9.978907047069135e-07, "loss": 8.444488048553467e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 492.25, "completions/min_length": 389.0, "epoch": 2.527941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4061745405197144, "kl": 0.006183752499055117, "learning_rate": 9.978789128267168e-07, "loss": 6.164610385894775e-05, "reward": 0.653499960899353, "reward_std": 0.3373071253299713, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 433.125, "completions/min_length": 381.0, "epoch": 2.5294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.019245799630880356, "kl": 0.004706271574832499, "learning_rate": 9.978670881475172e-07, "loss": 4.738805000670254e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 466.5625, "completions/min_length": 341.0, "epoch": 2.5308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2570650577545166, "kl": 0.004742954741232097, "learning_rate": 9.978552306700937e-07, "loss": 4.7072768211364746e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 462.0625, "completions/min_length": 403.0, "epoch": 2.5323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9135363698005676, "kl": 0.0047945783589966595, "learning_rate": 9.978433403952276e-07, "loss": 4.7229230403900146e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 455.1875, "completions/min_length": 397.0, "epoch": 2.5338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.02712559886276722, "kl": 0.004449471482075751, "learning_rate": 9.97831417323702e-07, "loss": 4.4894113671034575e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 437.4375, "completions/min_length": 404.0, "epoch": 2.5352941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.008569988422095776, "kl": 0.002943483181297779, "learning_rate": 9.978194614563028e-07, "loss": 2.9293934858287685e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 462.5625, "completions/min_length": 365.0, "epoch": 2.536764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2845747470855713, "kl": 0.0035220651188865304, "learning_rate": 9.97807472793817e-07, "loss": 3.5181641578674316e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 447.75, "completions/min_length": 412.0, "epoch": 2.538235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00929077435284853, "kl": 0.0038432125584222376, "learning_rate": 9.977954513370347e-07, "loss": 3.858159470837563e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 476.625, "completions/min_length": 400.0, "epoch": 2.539705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8585336208343506, "kl": 0.002863353176508099, "learning_rate": 9.97783397086748e-07, "loss": 2.8759241104125977e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 434.1875, "completions/min_length": 387.0, "epoch": 2.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.4327234029769897, "kl": 0.0038790787220932543, "learning_rate": 9.977713100437508e-07, "loss": 3.8994476199150085e-05, "reward": 0.8544583320617676, "reward_std": 0.22600455582141876, "rewards/DrugCombAccuracyCOTORM/mean": 0.8506250381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.3459377586841583, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.6803287863731384, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 460.5, "completions/min_length": 344.0, "epoch": 2.5426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9655739665031433, "kl": 0.0037677810178138316, "learning_rate": 9.977591902088393e-07, "loss": 3.74913215637207e-05, "reward": 0.7716249823570251, "reward_std": 0.1924738883972168, "rewards/DrugCombAccuracyCOTORM/mean": 0.7184374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.43696483969688416, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 473.875, "completions/min_length": 407.0, "epoch": 2.5441176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.0299676656723022, "kl": 0.0037402979214675725, "learning_rate": 9.97747037582812e-07, "loss": 3.739381645573303e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 422.1875, "completions/min_length": 364.0, "epoch": 2.5455882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.01551104336977005, "kl": 0.002932204690296203, "learning_rate": 9.977348521664697e-07, "loss": 2.9221448130556382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 500.75, "completions/min_length": 435.0, "epoch": 2.5470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 1.302538275718689, "kl": 0.004020343301817775, "learning_rate": 9.977226339606148e-07, "loss": 4.011392593383789e-05, "reward": 0.6108333468437195, "reward_std": 0.3139844536781311, "rewards/DrugCombAccuracyCOTORM/mean": 0.5604166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.4697073996067047, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5322906970977783, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 448.5, "completions/min_length": 396.0, "epoch": 2.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.1990649551153183, "kl": 0.006414335977751762, "learning_rate": 9.977103829660523e-07, "loss": 6.289785960689187e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 447.9375, "completions/min_length": 408.0, "epoch": 2.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.01150593627244234, "kl": 0.004272476420737803, "learning_rate": 9.976980991835893e-07, "loss": 4.2696556192822754e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 501.9375, "completions/min_length": 452.0, "epoch": 2.5514705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 1.116646647453308, "kl": 0.004976971715223044, "learning_rate": 9.976857826140352e-07, "loss": 5.095452070236206e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 518.875, "completions/min_length": 446.0, "epoch": 2.552941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9972501397132874, "kl": 0.007128773373551667, "learning_rate": 9.976734332582011e-07, "loss": 7.241964340209961e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 436.125, "completions/min_length": 364.0, "epoch": 2.5544117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 0.9131126999855042, "kl": 0.0033961344743147492, "learning_rate": 9.976610511169008e-07, "loss": 3.411457146285102e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.5625, "completions/min_length": 400.0, "epoch": 2.5558823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.022063085809350014, "kl": 0.005048756022006273, "learning_rate": 9.976486361909498e-07, "loss": 5.0222199206473306e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 474.1875, "completions/min_length": 416.0, "epoch": 2.557352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.4892538785934448, "kl": 0.004358379985205829, "learning_rate": 9.97636188481166e-07, "loss": 4.3705105781555176e-05, "reward": 0.78125, "reward_std": 0.38548368215560913, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 457.625, "completions/min_length": 382.0, "epoch": 2.5588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8667060136795044, "kl": 0.0034400057047605515, "learning_rate": 9.976237079883694e-07, "loss": 3.467418719083071e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 475.5, "completions/min_length": 413.0, "epoch": 2.560294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0974845886230469, "kl": 0.006465876824222505, "learning_rate": 9.976111947133825e-07, "loss": 6.443081656470895e-05, "reward": 0.3499999940395355, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 427.5, "completions/min_length": 361.0, "epoch": 2.5617647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 0.9621081948280334, "kl": 0.003391355276107788, "learning_rate": 9.975986486570292e-07, "loss": 3.41582162945997e-05, "reward": 0.5249999761581421, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 405.0625, "completions/min_length": 339.0, "epoch": 2.5632352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 1.0821466445922852, "kl": 0.003933995612896979, "learning_rate": 9.975860698201362e-07, "loss": 3.939494490623474e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/mean_length": 561.4375, "completions/min_length": 491.0, "epoch": 2.564705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.4195061922073364, "kl": 0.004919214814435691, "learning_rate": 9.975734582035321e-07, "loss": 4.9151480197906494e-05, "reward": 0.44999998807907104, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 443.0625, "completions/min_length": 396.0, "epoch": 2.5661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.2976797819137573, "kl": 0.005239386577159166, "learning_rate": 9.975608138080479e-07, "loss": 5.2206218242645264e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 454.5625, "completions/min_length": 424.0, "epoch": 2.567647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.990541398525238, "kl": 0.00361673254519701, "learning_rate": 9.975481366345165e-07, "loss": 3.626197576522827e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 495.125, "completions/min_length": 423.0, "epoch": 2.5691176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.1994173526763916, "kl": 0.004611029056832194, "learning_rate": 9.975354266837728e-07, "loss": 4.594992060447112e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 453.25, "completions/min_length": 424.0, "epoch": 2.5705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.016052348539233208, "kl": 0.0034152308362536132, "learning_rate": 9.975226839566545e-07, "loss": 3.3979638828895986e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 446.75, "completions/min_length": 392.0, "epoch": 2.572058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8822598457336426, "kl": 0.003498835547361523, "learning_rate": 9.975099084540007e-07, "loss": 3.536790609359741e-05, "reward": 0.8698333501815796, "reward_std": 0.028292685747146606, "rewards/DrugCombAccuracyCOTORM/mean": 0.8477083444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.16454075276851654, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 470.1875, "completions/min_length": 405.0, "epoch": 2.5735294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 0.7767784595489502, "kl": 0.002992647117935121, "learning_rate": 9.974971001766532e-07, "loss": 2.9881102818762884e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 447.0625, "completions/min_length": 380.0, "epoch": 2.575, "frac_reward_zero_std": 1.0, "grad_norm": 0.010290349833667278, "kl": 0.003546177555108443, "learning_rate": 9.974842591254557e-07, "loss": 3.493791518849321e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 454.0625, "completions/min_length": 416.0, "epoch": 2.576470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.006540242582559586, "kl": 0.0033856655936688185, "learning_rate": 9.974713853012545e-07, "loss": 3.3867447200464085e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 449.875, "completions/min_length": 349.0, "epoch": 2.5779411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 0.9761896729469299, "kl": 0.004525361233390868, "learning_rate": 9.97458478704897e-07, "loss": 4.513710882747546e-05, "reward": 0.8925000429153442, "reward_std": 0.14349238574504852, "rewards/DrugCombAccuracyCOTORM/mean": 0.8708333373069763, "rewards/DrugCombAccuracyCOTORM/std": 0.2681058645248413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 501.875, "completions/min_length": 394.0, "epoch": 2.5794117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.010834887623786926, "kl": 0.003759771992918104, "learning_rate": 9.97445539337234e-07, "loss": 3.732233017217368e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 403.8125, "completions/min_length": 354.0, "epoch": 2.5808823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.015204569324851036, "kl": 0.004111060465220362, "learning_rate": 9.974325671991178e-07, "loss": 4.083964813617058e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 478.6875, "completions/min_length": 421.0, "epoch": 2.5823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.1754860877990723, "kl": 0.005814496427774429, "learning_rate": 9.974195622914028e-07, "loss": 5.8300793170928955e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 456.5, "completions/min_length": 405.0, "epoch": 2.583823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.4619255065917969, "kl": 0.004271532176062465, "learning_rate": 9.974065246149458e-07, "loss": 4.275888204574585e-05, "reward": 0.45624998211860657, "reward_std": 0.1944543719291687, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 497.8125, "completions/min_length": 452.0, "epoch": 2.585294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8705642819404602, "kl": 0.002979264478199184, "learning_rate": 9.97393454170606e-07, "loss": 2.985600076499395e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 444.0, "completions/min_length": 367.0, "epoch": 2.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9729310870170593, "kl": 0.004178797884378582, "learning_rate": 9.973803509592437e-07, "loss": 4.147125582676381e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 428.375, "completions/min_length": 339.0, "epoch": 2.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010631069540977478, "kl": 0.00413657579338178, "learning_rate": 9.97367214981723e-07, "loss": 4.148252628510818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 444.8125, "completions/min_length": 406.0, "epoch": 2.5897058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.9223132133483887, "kl": 0.004023641464300454, "learning_rate": 9.973540462389087e-07, "loss": 4.0411949157714844e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 436.4375, "completions/min_length": 390.0, "epoch": 2.5911764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.009660289622843266, "kl": 0.004351860727183521, "learning_rate": 9.973408447316686e-07, "loss": 4.359476224635728e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 478.3125, "completions/min_length": 448.0, "epoch": 2.5926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.3290259838104248, "kl": 0.005249909299891442, "learning_rate": 9.973276104608723e-07, "loss": 5.2347779273986816e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 442.9375, "completions/min_length": 409.0, "epoch": 2.5941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.007665112614631653, "kl": 0.004037983657326549, "learning_rate": 9.973143434273914e-07, "loss": 4.041352804051712e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 454.5625, "completions/min_length": 402.0, "epoch": 2.5955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.006712908390909433, "kl": 0.0033304806565865874, "learning_rate": 9.973010436321003e-07, "loss": 3.335202927701175e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.6875, "completions/min_length": 386.0, "epoch": 2.597058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.327120065689087, "kl": 0.004610532429069281, "learning_rate": 9.972877110758748e-07, "loss": 4.534423351287842e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 436.5625, "completions/min_length": 401.0, "epoch": 2.598529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014352110214531422, "kl": 0.004122469108551741, "learning_rate": 9.972743457595935e-07, "loss": 4.126105704926886e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 470.75, "completions/min_length": 399.0, "epoch": 2.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.1957286596298218, "kl": 0.00674892554525286, "learning_rate": 9.972609476841365e-07, "loss": 6.781518459320068e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 476.6875, "completions/min_length": 407.0, "epoch": 2.601470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3137884140014648, "kl": 0.005567373475059867, "learning_rate": 9.97247516850387e-07, "loss": 5.494803190231323e-05, "reward": 0.5502499938011169, "reward_std": 0.23522986471652985, "rewards/DrugCombAccuracyCOTORM/mean": 0.4925000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.46994325518608093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6635342836380005, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 435.9375, "completions/min_length": 388.0, "epoch": 2.6029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.008751019835472107, "kl": 0.003225382708478719, "learning_rate": 9.972340532592295e-07, "loss": 3.2163130526896566e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 430.6875, "completions/min_length": 365.0, "epoch": 2.6044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2029504776000977, "kl": 0.003263959486503154, "learning_rate": 9.972205569115506e-07, "loss": 3.261491656303406e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 410.1875, "completions/min_length": 356.0, "epoch": 2.6058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.021437250077724457, "kl": 0.004384332511108369, "learning_rate": 9.972070278082398e-07, "loss": 4.404167702887207e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 449.3125, "completions/min_length": 398.0, "epoch": 2.6073529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.014862829819321632, "kl": 0.00826759950723499, "learning_rate": 9.971934659501883e-07, "loss": 8.119756239466369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 448.5625, "completions/min_length": 375.0, "epoch": 2.6088235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.009135310538113117, "kl": 0.0040152150322683156, "learning_rate": 9.971798713382896e-07, "loss": 4.005609662272036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 449.625, "completions/min_length": 406.0, "epoch": 2.610294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1475775241851807, "kl": 0.004139641998335719, "learning_rate": 9.97166243973439e-07, "loss": 4.110619192942977e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 445.6875, "completions/min_length": 381.0, "epoch": 2.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02392006106674671, "kl": 0.005229769391007721, "learning_rate": 9.971525838565348e-07, "loss": 5.29392491444014e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 513.25, "completions/min_length": 426.0, "epoch": 2.613235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3222445249557495, "kl": 0.004430833098012954, "learning_rate": 9.971388909884762e-07, "loss": 4.570186138153076e-05, "reward": 0.46488097310066223, "reward_std": 0.41399407386779785, "rewards/DrugCombAccuracyCOTORM/mean": 0.398809552192688, "rewards/DrugCombAccuracyCOTORM/std": 0.43812114000320435, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 497.4375, "completions/min_length": 412.0, "epoch": 2.614705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.155318021774292, "kl": 0.0054282809142023325, "learning_rate": 9.971251653701655e-07, "loss": 5.444389535114169e-05, "reward": 0.9800000190734863, "reward_std": 0.037032779306173325, "rewards/DrugCombAccuracyCOTORM/mean": 0.9750000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.06831300258636475, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 479.9375, "completions/min_length": 385.0, "epoch": 2.6161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.097190022468567, "kl": 0.004930495342705399, "learning_rate": 9.971114070025071e-07, "loss": 4.940909275319427e-05, "reward": 0.20624999701976776, "reward_std": 0.21619683504104614, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 448.4375, "completions/min_length": 403.0, "epoch": 2.6176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.194235920906067, "kl": 0.005413906066678464, "learning_rate": 9.970976158864074e-07, "loss": 5.412119935499504e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 413.5, "completions/min_length": 369.0, "epoch": 2.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.26107656955719, "kl": 0.004449599713552743, "learning_rate": 9.970837920227744e-07, "loss": 4.459460615180433e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 466.5625, "completions/min_length": 429.0, "epoch": 2.6205882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.00970708392560482, "kl": 0.004050812742207199, "learning_rate": 9.970699354125193e-07, "loss": 4.0590075514046475e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 416.5, "completions/min_length": 322.0, "epoch": 2.6220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9614633917808533, "kl": 0.004163214121945202, "learning_rate": 9.970560460565549e-07, "loss": 4.1618943214416504e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 477.5, "completions/min_length": 395.0, "epoch": 2.623529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6763129234313965, "kl": 0.003982132126111537, "learning_rate": 9.97042123955796e-07, "loss": 3.976374864578247e-05, "reward": 0.5718750357627869, "reward_std": 0.07954951375722885, "rewards/DrugCombAccuracyCOTORM/mean": 0.46875, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 432.0625, "completions/min_length": 368.0, "epoch": 2.625, "frac_reward_zero_std": 0.5, "grad_norm": 1.1589449644088745, "kl": 0.0046234651817940176, "learning_rate": 9.970281691111597e-07, "loss": 4.637284655473195e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 434.0, "completions/min_length": 370.0, "epoch": 2.626470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9439301490783691, "kl": 0.0035758489393629134, "learning_rate": 9.970141815235657e-07, "loss": 3.553926944732666e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 466.25, "completions/min_length": 420.0, "epoch": 2.6279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0076960325241089, "kl": 0.004207019810564816, "learning_rate": 9.97000161193935e-07, "loss": 4.219673792249523e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 481.5, "completions/min_length": 423.0, "epoch": 2.6294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.009006339125335217, "kl": 0.0050695466925390065, "learning_rate": 9.969861081231916e-07, "loss": 5.049070023233071e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 495.4375, "completions/min_length": 398.0, "epoch": 2.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8070102334022522, "kl": 0.0036713219597004354, "learning_rate": 9.96972022312261e-07, "loss": 3.700250090332702e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 435.25, "completions/min_length": 382.0, "epoch": 2.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009799106046557426, "kl": 0.004508728918153793, "learning_rate": 9.969579037620713e-07, "loss": 4.491224535740912e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 504.875, "completions/min_length": 427.0, "epoch": 2.6338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1215486526489258, "kl": 0.0037499414174817502, "learning_rate": 9.969437524735524e-07, "loss": 3.732621917151846e-05, "reward": 0.7365216016769409, "reward_std": 0.1925237476825714, "rewards/DrugCombAccuracyCOTORM/mean": 0.6894019842147827, "rewards/DrugCombAccuracyCOTORM/std": 0.44433972239494324, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.23664319515228271, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 421.6875, "completions/min_length": 400.0, "epoch": 2.635294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8573672771453857, "kl": 0.004802901647053659, "learning_rate": 9.969295684476368e-07, "loss": 4.786123463418335e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 436.5625, "completions/min_length": 351.0, "epoch": 2.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013918586075305939, "kl": 0.0049688866711221635, "learning_rate": 9.969153516852588e-07, "loss": 4.9523736379342154e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 413.4375, "completions/min_length": 376.0, "epoch": 2.638235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010012401267886162, "kl": 0.003917267604265362, "learning_rate": 9.96901102187355e-07, "loss": 3.92824767914135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 425.3125, "completions/min_length": 351.0, "epoch": 2.639705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.05318423733115196, "kl": 0.005204534740187228, "learning_rate": 9.96886819954864e-07, "loss": 5.1566294132499024e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 433.75, "completions/min_length": 393.0, "epoch": 2.6411764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.02054891735315323, "kl": 0.005343329976312816, "learning_rate": 9.968725049887267e-07, "loss": 5.4157608246896416e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 487.5625, "completions/min_length": 417.0, "epoch": 2.6426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.1081148386001587, "kl": 0.004949091991875321, "learning_rate": 9.968581572898865e-07, "loss": 4.956402335665189e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 403.0, "completions/min_length": 359.0, "epoch": 2.6441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0007364749908447, "kl": 0.003613431821577251, "learning_rate": 9.968437768592881e-07, "loss": 3.6083205486647785e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 467.9375, "completions/min_length": 421.0, "epoch": 2.6455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012585846707224846, "kl": 0.004019726999104023, "learning_rate": 9.968293636978792e-07, "loss": 4.035299934912473e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/mean_length": 496.75, "completions/min_length": 363.0, "epoch": 2.6470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0669463872909546, "kl": 0.004015685350168496, "learning_rate": 9.968149178066087e-07, "loss": 4.072210140293464e-05, "reward": 0.8002350330352783, "reward_std": 0.217046856880188, "rewards/DrugCombAccuracyCOTORM/mean": 0.7727547287940979, "rewards/DrugCombAccuracyCOTORM/std": 0.4093123972415924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8203125, "rewards/DrugCombCoverageCOTORM/std": 0.3534613251686096, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 485.875, "completions/min_length": 414.0, "epoch": 2.648529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9245598316192627, "kl": 0.0039836575742810965, "learning_rate": 9.968004391864291e-07, "loss": 3.972649574279785e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 471.0625, "completions/min_length": 351.0, "epoch": 2.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.9062196612358093, "kl": 0.003723033471032977, "learning_rate": 9.967859278382937e-07, "loss": 3.7096673622727394e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 446.375, "completions/min_length": 406.0, "epoch": 2.651470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00755238626152277, "kl": 0.0038131807232275605, "learning_rate": 9.967713837631587e-07, "loss": 3.780741826631129e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 430.5625, "completions/min_length": 362.0, "epoch": 2.652941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3438581228256226, "kl": 0.004508457845076919, "learning_rate": 9.967568069619818e-07, "loss": 4.564225673675537e-05, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 444.125, "completions/min_length": 304.0, "epoch": 2.6544117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9387907385826111, "kl": 0.005164034373592585, "learning_rate": 9.96742197435724e-07, "loss": 5.122640868648887e-05, "reward": 0.8125, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.36371922492980957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 444.9375, "completions/min_length": 390.0, "epoch": 2.6558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1433228254318237, "kl": 0.003959430556278676, "learning_rate": 9.96727555185347e-07, "loss": 3.949389429180883e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 437.8125, "completions/min_length": 397.0, "epoch": 2.6573529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.012188432738184929, "kl": 0.004054685530718416, "learning_rate": 9.967128802118157e-07, "loss": 4.042076398036443e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 456.8125, "completions/min_length": 373.0, "epoch": 2.6588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0154320001602173, "kl": 0.0035142666311003268, "learning_rate": 9.96698172516097e-07, "loss": 3.506673965603113e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 450.5, "completions/min_length": 414.0, "epoch": 2.6602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.047408442944288254, "kl": 0.0032850339775905013, "learning_rate": 9.966834320991598e-07, "loss": 3.278147414675914e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 458.9375, "completions/min_length": 372.0, "epoch": 2.661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4283922910690308, "kl": 0.0037888973020017147, "learning_rate": 9.966686589619748e-07, "loss": 3.819167613983154e-05, "reward": 0.550000011920929, "reward_std": 0.3265853822231293, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 476.75, "completions/min_length": 385.0, "epoch": 2.663235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2305790185928345, "kl": 0.003422281239181757, "learning_rate": 9.966538531055157e-07, "loss": 3.40893748216331e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.5, "completions/min_length": 400.0, "epoch": 2.664705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.172593355178833, "kl": 0.005428745469544083, "learning_rate": 9.966390145307574e-07, "loss": 5.4217765864450485e-05, "reward": 0.7749999761581421, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 477.375, "completions/min_length": 383.0, "epoch": 2.666176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3247289657592773, "kl": 0.00452736159786582, "learning_rate": 9.96624143238678e-07, "loss": 4.5612454414367676e-05, "reward": 0.6500000357627869, "reward_std": 0.34503281116485596, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.4669641852378845, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 527.3125, "completions/min_length": 445.0, "epoch": 2.6676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9819633364677429, "kl": 0.005369223246816546, "learning_rate": 9.966092392302565e-07, "loss": 5.3974414186086506e-05, "reward": 0.24419793486595154, "reward_std": 0.018509112298488617, "rewards/DrugCombAccuracyCOTORM/mean": 0.08561383932828903, "rewards/DrugCombAccuracyCOTORM/std": 0.09128844738006592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7570684552192688, "rewards/DrugCombCoverageCOTORM/std": 0.26093995571136475, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 460.4375, "completions/min_length": 401.0, "epoch": 2.6691176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.252598524093628, "kl": 0.005939563270658255, "learning_rate": 9.965943025064753e-07, "loss": 5.8957451983587816e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 506.5625, "completions/min_length": 428.0, "epoch": 2.6705882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9119126200675964, "kl": 0.004530766454990953, "learning_rate": 9.96579333068318e-07, "loss": 4.529207944869995e-05, "reward": 0.8800417184829712, "reward_std": 0.15145014226436615, "rewards/DrugCombAccuracyCOTORM/mean": 0.8539583683013916, "rewards/DrugCombAccuracyCOTORM/std": 0.29159578680992126, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 435.5, "completions/min_length": 404.0, "epoch": 2.6720588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.007995459251105785, "kl": 0.0039814814226701856, "learning_rate": 9.96564330916771e-07, "loss": 3.9373862819047645e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 510.1875, "completions/min_length": 378.0, "epoch": 2.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1282461881637573, "kl": 0.00812965986551717, "learning_rate": 9.965492960528227e-07, "loss": 8.166788029484451e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 441.875, "completions/min_length": 419.0, "epoch": 2.675, "frac_reward_zero_std": 0.5, "grad_norm": 1.1116241216659546, "kl": 0.006511444633360952, "learning_rate": 9.965342284774631e-07, "loss": 6.489226507255808e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 438.0625, "completions/min_length": 344.0, "epoch": 2.6764705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.011439746245741844, "kl": 0.003419410262722522, "learning_rate": 9.96519128191685e-07, "loss": 3.427401679800823e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 447.6875, "completions/min_length": 393.0, "epoch": 2.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014167958870530128, "kl": 0.004989317152649164, "learning_rate": 9.965039951964838e-07, "loss": 4.978537981514819e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 474.0625, "completions/min_length": 375.0, "epoch": 2.6794117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 0.9215858578681946, "kl": 0.0034595829783938825, "learning_rate": 9.964888294928556e-07, "loss": 3.477343125268817e-05, "reward": 0.8176041841506958, "reward_std": 0.22900554537773132, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4000000059604645, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5426273941993713, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 429.625, "completions/min_length": 362.0, "epoch": 2.6808823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.008367921225726604, "kl": 0.004217701207380742, "learning_rate": 9.964736310817995e-07, "loss": 4.197354428470135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 458.4375, "completions/min_length": 378.0, "epoch": 2.682352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9738666415214539, "kl": 0.007129304809495807, "learning_rate": 9.964583999643174e-07, "loss": 7.04147678334266e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 471.25, "completions/min_length": 389.0, "epoch": 2.6838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.9557054042816162, "kl": 0.004617927013896406, "learning_rate": 9.96443136141412e-07, "loss": 4.573643673211336e-05, "reward": 0.8925000429153442, "reward_std": 0.14349240064620972, "rewards/DrugCombAccuracyCOTORM/mean": 0.8708333373069763, "rewards/DrugCombAccuracyCOTORM/std": 0.2681058645248413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 390.625, "completions/min_length": 356.0, "epoch": 2.685294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.4085253477096558, "kl": 0.005961001385003328, "learning_rate": 9.964278396140893e-07, "loss": 5.965679883956909e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 452.625, "completions/min_length": 398.0, "epoch": 2.6867647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.5693473815917969, "kl": 0.004111453541554511, "learning_rate": 9.96412510383357e-07, "loss": 4.094839096069336e-05, "reward": 0.8089166879653931, "reward_std": 0.3232209086418152, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 476.9375, "completions/min_length": 405.0, "epoch": 2.6882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 1.4487872123718262, "kl": 0.0046148248366080225, "learning_rate": 9.963971484502244e-07, "loss": 4.60892915725708e-05, "reward": 0.7840999960899353, "reward_std": 0.3662574589252472, "rewards/DrugCombAccuracyCOTORM/mean": 0.7582499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.43346166610717773, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7749999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.41231057047843933, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 486.375, "completions/min_length": 375.0, "epoch": 2.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591171145439148, "kl": 0.0035566737642511725, "learning_rate": 9.96381753815704e-07, "loss": 3.56137752532959e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 466.5, "completions/min_length": 421.0, "epoch": 2.6911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.1662116050720215, "kl": 0.003730973054189235, "learning_rate": 9.9636632648081e-07, "loss": 3.731772812898271e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 436.1875, "completions/min_length": 401.0, "epoch": 2.692647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.00992378406226635, "kl": 0.005160311120562255, "learning_rate": 9.963508664465585e-07, "loss": 5.140723078511655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 424.125, "completions/min_length": 372.0, "epoch": 2.6941176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.00993600208312273, "kl": 0.0036033050855621696, "learning_rate": 9.963353737139679e-07, "loss": 3.602011202019639e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 476.6875, "completions/min_length": 433.0, "epoch": 2.6955882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 0.9958650469779968, "kl": 0.004432399640791118, "learning_rate": 9.963198482840592e-07, "loss": 4.433431604411453e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 476.1875, "completions/min_length": 426.0, "epoch": 2.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7320551872253418, "kl": 0.004121243371628225, "learning_rate": 9.963042901578547e-07, "loss": 4.1157007217407227e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 500.5625, "completions/min_length": 418.0, "epoch": 2.6985294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.0950521230697632, "kl": 0.004859864478930831, "learning_rate": 9.962886993363795e-07, "loss": 4.872580029768869e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 450.8125, "completions/min_length": 378.0, "epoch": 2.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.006101869512349367, "kl": 0.0031163827516138554, "learning_rate": 9.96273075820661e-07, "loss": 3.1230636523105204e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 506.6875, "completions/min_length": 449.0, "epoch": 2.701470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8333719372749329, "kl": 0.004662054707296193, "learning_rate": 9.96257419611728e-07, "loss": 4.684925079345703e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 477.625, "completions/min_length": 363.0, "epoch": 2.7029411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.006322186440229416, "kl": 0.0034745947341434658, "learning_rate": 9.962417307106122e-07, "loss": 3.482877218630165e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 495.5, "completions/min_length": 444.0, "epoch": 2.7044117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.016763074323534966, "kl": 0.004934986704029143, "learning_rate": 9.962260091183467e-07, "loss": 4.910939605906606e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 492.625, "completions/min_length": 422.0, "epoch": 2.7058823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.6148842573165894, "kl": 0.0056679603876546025, "learning_rate": 9.96210254835968e-07, "loss": 5.602836608886719e-05, "reward": 0.3687499761581421, "reward_std": 0.3932233452796936, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6718547940254211, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 520.875, "completions/min_length": 458.0, "epoch": 2.7073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9927424788475037, "kl": 0.0038364195497706532, "learning_rate": 9.96194467864513e-07, "loss": 3.820657730102539e-05, "reward": 0.6666666865348816, "reward_std": 0.12344269454479218, "rewards/DrugCombAccuracyCOTORM/mean": 0.5833333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.4791968762874603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 411.5625, "completions/min_length": 349.0, "epoch": 2.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.4180315732955933, "kl": 0.004632745927665383, "learning_rate": 9.961786482050223e-07, "loss": 4.595857535605319e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 500.25, "completions/min_length": 399.0, "epoch": 2.710294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9774418473243713, "kl": 0.005725386552512646, "learning_rate": 9.961627958585381e-07, "loss": 5.9232552303001285e-05, "reward": 0.6993916630744934, "reward_std": 0.0031397968996316195, "rewards/DrugCombAccuracyCOTORM/mean": 0.6354374885559082, "rewards/DrugCombAccuracyCOTORM/std": 0.37653860449790955, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9104166626930237, "rewards/DrugCombCoverageCOTORM/std": 0.09326882660388947, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 545.875, "completions/min_length": 423.0, "epoch": 2.711764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6544827222824097, "kl": 0.005034811212681234, "learning_rate": 9.961469108261045e-07, "loss": 5.05298376083374e-05, "reward": 0.7449448704719543, "reward_std": 0.3308257758617401, "rewards/DrugCombAccuracyCOTORM/mean": 0.7130821347236633, "rewards/DrugCombAccuracyCOTORM/std": 0.3476674258708954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7447916865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6814124584197998, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 450.3125, "completions/min_length": 409.0, "epoch": 2.713235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2743512392044067, "kl": 0.00438949407543987, "learning_rate": 9.961309931087679e-07, "loss": 4.37755516031757e-05, "reward": 0.6678333282470703, "reward_std": 0.21822859346866608, "rewards/DrugCombAccuracyCOTORM/mean": 0.6525000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.46795299649238586, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8766518831253052, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 482.625, "completions/min_length": 401.0, "epoch": 2.7147058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.9451746940612793, "kl": 0.0033692498691380024, "learning_rate": 9.96115042707577e-07, "loss": 3.346418452565558e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 482.375, "completions/min_length": 433.0, "epoch": 2.7161764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.274519681930542, "kl": 0.003981132700573653, "learning_rate": 9.960990596235824e-07, "loss": 3.952533006668091e-05, "reward": 0.4127333462238312, "reward_std": 0.33793962001800537, "rewards/DrugCombAccuracyCOTORM/mean": 0.28987500071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4271889328956604, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8083333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.2940143644809723, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 436.0625, "completions/min_length": 369.0, "epoch": 2.7176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.038104772567749, "kl": 0.003948805038817227, "learning_rate": 9.960830438578378e-07, "loss": 3.9049366023391485e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 434.625, "completions/min_length": 386.0, "epoch": 2.7191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009077932685613632, "kl": 0.003937069559469819, "learning_rate": 9.960669954113972e-07, "loss": 3.9462262066081166e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 470.125, "completions/min_length": 423.0, "epoch": 2.7205882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.008433302864432335, "kl": 0.003532874572556466, "learning_rate": 9.960509142853187e-07, "loss": 3.5154000215698034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 463.6875, "completions/min_length": 391.0, "epoch": 2.722058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01050583180040121, "kl": 0.004326274909544736, "learning_rate": 9.960348004806608e-07, "loss": 4.310687290853821e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 529.75, "completions/min_length": 455.0, "epoch": 2.723529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9429216384887695, "kl": 0.004794497741386294, "learning_rate": 9.960186539984861e-07, "loss": 4.757710485137068e-05, "reward": 0.6779175400733948, "reward_std": 0.08098118752241135, "rewards/DrugCombAccuracyCOTORM/mean": 0.6182302832603455, "rewards/DrugCombAccuracyCOTORM/std": 0.41363269090652466, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.21081852912902832, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 521.375, "completions/min_length": 418.0, "epoch": 2.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.8452778458595276, "kl": 0.005184917477890849, "learning_rate": 9.960024748398575e-07, "loss": 5.187094211578369e-05, "reward": 0.763200044631958, "reward_std": 0.11103242635726929, "rewards/DrugCombAccuracyCOTORM/mean": 0.7352499961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.33162620663642883, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.27962350845336914, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 493.1875, "completions/min_length": 437.0, "epoch": 2.726470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5431458950042725, "kl": 0.004963980638422072, "learning_rate": 9.95986263005841e-07, "loss": 4.9427151679992676e-05, "reward": 0.31858453154563904, "reward_std": 0.3409861624240875, "rewards/DrugCombAccuracyCOTORM/mean": 0.2198452353477478, "rewards/DrugCombAccuracyCOTORM/std": 0.3773631155490875, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4270833432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8648886680603027, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 449.1875, "completions/min_length": 352.0, "epoch": 2.7279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.117671251296997, "kl": 0.0062099494389258325, "learning_rate": 9.959700184975048e-07, "loss": 6.189601117512211e-05, "reward": 0.32500001788139343, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 411.1875, "completions/min_length": 365.0, "epoch": 2.7294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.02568625658750534, "kl": 0.0052894530817866325, "learning_rate": 9.959537413159188e-07, "loss": 5.2334369684103876e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.75, "completions/min_length": 392.0, "epoch": 2.7308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0283039808273315, "kl": 0.004070906667038798, "learning_rate": 9.959374314621556e-07, "loss": 4.0762126445770264e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 440.625, "completions/min_length": 324.0, "epoch": 2.7323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.008515531197190285, "kl": 0.0036518899723887444, "learning_rate": 9.959210889372895e-07, "loss": 3.651501901913434e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 519.75, "completions/min_length": 438.0, "epoch": 2.7338235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 2.348959445953369, "kl": 0.004818313638679683, "learning_rate": 9.95904713742397e-07, "loss": 4.788488149642944e-05, "reward": 0.44218751788139343, "reward_std": 0.38617488741874695, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 475.8125, "completions/min_length": 373.0, "epoch": 2.735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009715968742966652, "kl": 0.00415611919015646, "learning_rate": 9.958883058785569e-07, "loss": 4.170848842477426e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 431.375, "completions/min_length": 364.0, "epoch": 2.736764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.040462613105774, "kl": 0.00499800662510097, "learning_rate": 9.9587186534685e-07, "loss": 4.938989877700806e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 444.5, "completions/min_length": 367.0, "epoch": 2.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0641478300094604, "kl": 0.0036898372345604002, "learning_rate": 9.958553921483598e-07, "loss": 3.675546759041026e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 487.25, "completions/min_length": 439.0, "epoch": 2.739705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.04204312339425087, "kl": 0.005624722340144217, "learning_rate": 9.958388862841712e-07, "loss": 5.454559868667275e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 480.375, "completions/min_length": 395.0, "epoch": 2.7411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9931890964508057, "kl": 0.0033758548088371754, "learning_rate": 9.958223477553714e-07, "loss": 3.342609852552414e-05, "reward": 0.8988749980926514, "reward_std": 0.18881995975971222, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 472.4375, "completions/min_length": 366.0, "epoch": 2.7426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1692516803741455, "kl": 0.0059970851289108396, "learning_rate": 9.958057765630502e-07, "loss": 5.9628247981891036e-05, "reward": 0.550000011920929, "reward_std": 0.053452249616384506, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 479.8125, "completions/min_length": 365.0, "epoch": 2.7441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.940621018409729, "kl": 0.004199849732685834, "learning_rate": 9.95789172708299e-07, "loss": 4.175923822913319e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 546.1875, "completions/min_length": 386.0, "epoch": 2.7455882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.2457249164581299, "kl": 0.004202701791655272, "learning_rate": 9.95772536192212e-07, "loss": 4.223734140396118e-05, "reward": 0.49270835518836975, "reward_std": 0.2108047753572464, "rewards/DrugCombAccuracyCOTORM/mean": 0.40625, "rewards/DrugCombAccuracyCOTORM/std": 0.38953322172164917, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6770833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 486.375, "completions/min_length": 353.0, "epoch": 2.7470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.006304929964244366, "kl": 0.003159900545142591, "learning_rate": 9.957558670158848e-07, "loss": 3.153897705487907e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 400.4375, "completions/min_length": 301.0, "epoch": 2.748529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01375030167400837, "kl": 0.004208933038171381, "learning_rate": 9.957391651804158e-07, "loss": 4.216684465063736e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 423.5625, "completions/min_length": 315.0, "epoch": 2.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.010700162500143051, "kl": 0.0043985548545606434, "learning_rate": 9.957224306869053e-07, "loss": 4.3696614739019424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 442.3125, "completions/min_length": 351.0, "epoch": 2.751470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.292829990386963, "kl": 0.007753578131087124, "learning_rate": 9.957056635364554e-07, "loss": 8.15466046333313e-05, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 538.0625, "completions/min_length": 414.0, "epoch": 2.7529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375813007354736, "kl": 0.0047182736452668905, "learning_rate": 9.956888637301708e-07, "loss": 4.719197750091553e-05, "reward": 0.71875, "reward_std": 0.2298097014427185, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 478.75, "completions/min_length": 415.0, "epoch": 2.7544117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1125190258026123, "kl": 0.004020591790322214, "learning_rate": 9.956720312691584e-07, "loss": 4.05777245759964e-05, "reward": 0.8921874761581421, "reward_std": 0.20032759010791779, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 494.8125, "completions/min_length": 411.0, "epoch": 2.7558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7821910381317139, "kl": 0.0029355367878451943, "learning_rate": 9.956551661545268e-07, "loss": 2.9203722078818828e-05, "reward": 0.9239583015441895, "reward_std": 0.1609395444393158, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 503.0, "completions/min_length": 477.0, "epoch": 2.7573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.067224383354187, "kl": 0.004464895406272262, "learning_rate": 9.956382683873875e-07, "loss": 4.450710548553616e-05, "reward": 0.4000000059604645, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 418.375, "completions/min_length": 363.0, "epoch": 2.7588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009996776469051838, "kl": 0.004597395774908364, "learning_rate": 9.95621337968853e-07, "loss": 4.614565477822907e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 426.875, "completions/min_length": 357.0, "epoch": 2.760294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009404955431818962, "kl": 0.004004197544418275, "learning_rate": 9.956043749000392e-07, "loss": 4.048419214086607e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 466.9375, "completions/min_length": 403.0, "epoch": 2.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011656461283564568, "kl": 0.004626464797183871, "learning_rate": 9.955873791820635e-07, "loss": 4.62578100268729e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 418.3125, "completions/min_length": 355.0, "epoch": 2.763235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009038152173161507, "kl": 0.003380274458322674, "learning_rate": 9.955703508160454e-07, "loss": 3.397018735995516e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 456.25, "completions/min_length": 395.0, "epoch": 2.764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9957555532455444, "kl": 0.0035567019949667156, "learning_rate": 9.955532898031068e-07, "loss": 3.568953979993239e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 511.3125, "completions/min_length": 423.0, "epoch": 2.7661764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9331355690956116, "kl": 0.0051558942650444806, "learning_rate": 9.955361961443713e-07, "loss": 5.1531031203921884e-05, "reward": 0.6937500238418579, "reward_std": 0.1898072361946106, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 476.125, "completions/min_length": 425.0, "epoch": 2.7676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.1293431520462036, "kl": 0.004290631739422679, "learning_rate": 9.955190698409655e-07, "loss": 4.2829662561416626e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 436.625, "completions/min_length": 393.0, "epoch": 2.7691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008952410891652107, "kl": 0.003921910538338125, "learning_rate": 9.955019108940172e-07, "loss": 3.9343263779301196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 414.8125, "completions/min_length": 364.0, "epoch": 2.7705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075491345487535, "kl": 0.003767493413761258, "learning_rate": 9.95484719304657e-07, "loss": 3.767460657400079e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 451.1875, "completions/min_length": 368.0, "epoch": 2.7720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8108499646186829, "kl": 0.004434943723026663, "learning_rate": 9.954674950740173e-07, "loss": 4.457712930161506e-05, "reward": 0.656166672706604, "reward_std": 0.04289780929684639, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/mean_length": 502.5, "completions/min_length": 401.0, "epoch": 2.773529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8822373747825623, "kl": 0.003381973918294534, "learning_rate": 9.954502382032332e-07, "loss": 3.4545784728834406e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 476.9375, "completions/min_length": 430.0, "epoch": 2.775, "frac_reward_zero_std": 0.0, "grad_norm": 1.3944405317306519, "kl": 0.006358098995406181, "learning_rate": 9.95432948693441e-07, "loss": 6.350129842758179e-05, "reward": 0.3748333156108856, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.2549999952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.2061067670583725, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385499686002731, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 526.9375, "completions/min_length": 434.0, "epoch": 2.776470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0291849374771118, "kl": 0.004572069621644914, "learning_rate": 9.954156265457801e-07, "loss": 4.5565295295091346e-05, "reward": 0.7746666669845581, "reward_std": 0.11295006424188614, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.34013479948043823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 442.375, "completions/min_length": 411.0, "epoch": 2.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009914005175232887, "kl": 0.0035413579898886383, "learning_rate": 9.953982717613914e-07, "loss": 3.5361772461328655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 440.5, "completions/min_length": 362.0, "epoch": 2.7794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0487590990960598, "kl": 0.006028629723004997, "learning_rate": 9.953808843414182e-07, "loss": 6.102468614699319e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 491.875, "completions/min_length": 424.0, "epoch": 2.7808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.324103832244873, "kl": 0.005398545356001705, "learning_rate": 9.95363464287006e-07, "loss": 5.4504722356796265e-05, "reward": 0.45922917127609253, "reward_std": 0.06186428666114807, "rewards/DrugCombAccuracyCOTORM/mean": 0.37351566553115845, "rewards/DrugCombAccuracyCOTORM/std": 0.28700995445251465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.2846375107765198, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.0, "completions/min_length": 377.0, "epoch": 2.7823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2845873832702637, "kl": 0.005135856743436307, "learning_rate": 9.953460115993025e-07, "loss": 5.245372449280694e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 474.375, "completions/min_length": 430.0, "epoch": 2.7838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.015321658924221992, "kl": 0.004902416549157351, "learning_rate": 9.95328526279457e-07, "loss": 4.8951111239148304e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 538.3125, "completions/min_length": 425.0, "epoch": 2.7852941176470587, "frac_reward_zero_std": 0.0, "grad_norm": 1.65585196018219, "kl": 0.01846982550341636, "learning_rate": 9.95311008328622e-07, "loss": 0.00018924474716186523, "reward": 0.2630833387374878, "reward_std": 0.23806285858154297, "rewards/DrugCombAccuracyCOTORM/mean": 0.13875000178813934, "rewards/DrugCombAccuracyCOTORM/std": 0.34062445163726807, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.438325971364975, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 438.875, "completions/min_length": 361.0, "epoch": 2.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9706829786300659, "kl": 0.005183876550290734, "learning_rate": 9.952934577479512e-07, "loss": 5.157716805115342e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 437.625, "completions/min_length": 417.0, "epoch": 2.788235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7274866104125977, "kl": 0.004766673664562404, "learning_rate": 9.952758745386007e-07, "loss": 4.728828935185447e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 442.8125, "completions/min_length": 383.0, "epoch": 2.789705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015250966884195805, "kl": 0.004026066802907735, "learning_rate": 9.952582587017291e-07, "loss": 4.008461837656796e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 419.5625, "completions/min_length": 352.0, "epoch": 2.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0298019647598267, "kl": 0.004103939339984208, "learning_rate": 9.95240610238497e-07, "loss": 4.105627158423886e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 463.0625, "completions/min_length": 383.0, "epoch": 2.7926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.1612025499343872, "kl": 0.00506620854139328, "learning_rate": 9.952229291500664e-07, "loss": 5.07161021232605e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 498.3125, "completions/min_length": 420.0, "epoch": 2.7941176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9127403497695923, "kl": 0.0046317450469359756, "learning_rate": 9.952052154376024e-07, "loss": 4.608681774698198e-05, "reward": 0.6079999804496765, "reward_std": 0.0377245657145977, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47761037945747375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3162277936935425, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 421.625, "completions/min_length": 386.0, "epoch": 2.7955882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 1.744144320487976, "kl": 0.004965000902302563, "learning_rate": 9.951874691022724e-07, "loss": 4.9717724323272705e-05, "reward": 0.3812500238418579, "reward_std": 0.39963412284851074, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 420.5, "completions/min_length": 353.0, "epoch": 2.7970588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 1.034958839416504, "kl": 0.005108594195917249, "learning_rate": 9.95169690145245e-07, "loss": 5.105906166136265e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 441.5625, "completions/min_length": 382.0, "epoch": 2.798529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2283726930618286, "kl": 0.0054063788265921175, "learning_rate": 9.951518785676914e-07, "loss": 5.43445348739624e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 491.375, "completions/min_length": 442.0, "epoch": 2.8, "frac_reward_zero_std": 0.5, "grad_norm": 0.9757170677185059, "kl": 0.003255050047300756, "learning_rate": 9.95134034370785e-07, "loss": 3.258081051171757e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 405.0625, "completions/min_length": 301.0, "epoch": 2.8014705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.011945231817662716, "kl": 0.00502695410978049, "learning_rate": 9.951161575557018e-07, "loss": 5.054751818533987e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 483.375, "completions/min_length": 398.0, "epoch": 2.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0144143104553223, "kl": 0.005194652418140322, "learning_rate": 9.95098248123619e-07, "loss": 5.128607153892517e-05, "reward": 0.518750011920929, "reward_std": 0.03720119222998619, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.981070876121521, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 476.125, "completions/min_length": 422.0, "epoch": 2.8044117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.007282726000994444, "kl": 0.003796238568611443, "learning_rate": 9.950803060757164e-07, "loss": 3.774344804696739e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 436.0625, "completions/min_length": 410.0, "epoch": 2.8058823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.011791681870818138, "kl": 0.003937535220757127, "learning_rate": 9.950623314131762e-07, "loss": 3.9452457713196054e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 452.625, "completions/min_length": 416.0, "epoch": 2.807352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.006406453903764486, "kl": 0.003472906770184636, "learning_rate": 9.950443241371823e-07, "loss": 3.473063407000154e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 468.625, "completions/min_length": 389.0, "epoch": 2.8088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0927603244781494, "kl": 0.005409713077824563, "learning_rate": 9.950262842489214e-07, "loss": 5.455295104184188e-05, "reward": 0.5614583492279053, "reward_std": 0.012549505569040775, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6145833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.43341344594955444, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 478.875, "completions/min_length": 360.0, "epoch": 2.810294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3344967365264893, "kl": 0.004019183863420039, "learning_rate": 9.950082117495814e-07, "loss": 4.019588232040405e-05, "reward": 0.6818749904632568, "reward_std": 0.38702690601348877, "rewards/DrugCombAccuracyCOTORM/mean": 0.614062488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.45747578144073486, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.1717960685491562, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 434.75, "completions/min_length": 380.0, "epoch": 2.8117647058823527, "frac_reward_zero_std": 1.0, "grad_norm": 0.010909248143434525, "kl": 0.003822682425379753, "learning_rate": 9.949901066403534e-07, "loss": 3.8353566196747124e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 444.9375, "completions/min_length": 387.0, "epoch": 2.8132352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.008439893834292889, "kl": 0.00390687893377617, "learning_rate": 9.949719689224296e-07, "loss": 3.9280359487747774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 486.75, "completions/min_length": 398.0, "epoch": 2.814705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.021192070096731186, "kl": 0.005175292782951146, "learning_rate": 9.949537985970052e-07, "loss": 5.1958686526631936e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 449.375, "completions/min_length": 380.0, "epoch": 2.8161764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.0300521850585938, "kl": 0.004386308777611703, "learning_rate": 9.949355956652773e-07, "loss": 4.3742358684539795e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 452.25, "completions/min_length": 408.0, "epoch": 2.817647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0081907510757446, "kl": 0.004824506118893623, "learning_rate": 9.949173601284446e-07, "loss": 4.834485298488289e-05, "reward": 0.3500000238418579, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 518.4375, "completions/min_length": 454.0, "epoch": 2.8191176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 0.9434002041816711, "kl": 0.004112257389351726, "learning_rate": 9.94899091987709e-07, "loss": 4.136849383939989e-05, "reward": 0.9623125195503235, "reward_std": 0.07315737754106522, "rewards/DrugCombAccuracyCOTORM/mean": 0.95549476146698, "rewards/DrugCombAccuracyCOTORM/std": 0.12726296484470367, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 518.75, "completions/min_length": 463.0, "epoch": 2.8205882352941174, "frac_reward_zero_std": 0.0, "grad_norm": 1.1590481996536255, "kl": 0.004596762184519321, "learning_rate": 9.948807912442734e-07, "loss": 4.649907350540161e-05, "reward": 0.6317000389099121, "reward_std": 0.3159658908843994, "rewards/DrugCombAccuracyCOTORM/mean": 0.5786874890327454, "rewards/DrugCombAccuracyCOTORM/std": 0.4553118646144867, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 445.6875, "completions/min_length": 397.0, "epoch": 2.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01679745689034462, "kl": 0.004765246470924467, "learning_rate": 9.948624578993437e-07, "loss": 4.758957220474258e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 403.4375, "completions/min_length": 367.0, "epoch": 2.8235294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.017643336206674576, "kl": 0.006142140016891062, "learning_rate": 9.948440919541277e-07, "loss": 6.0938931710552424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 462.6875, "completions/min_length": 371.0, "epoch": 2.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.02171247825026512, "kl": 0.0042755042668432, "learning_rate": 9.948256934098351e-07, "loss": 4.234614607412368e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 450.0625, "completions/min_length": 399.0, "epoch": 2.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012036852538585663, "kl": 0.003964565577916801, "learning_rate": 9.948072622676782e-07, "loss": 3.970511897932738e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 485.75, "completions/min_length": 392.0, "epoch": 2.8279411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 1.0673471689224243, "kl": 0.00652828614693135, "learning_rate": 9.947887985288709e-07, "loss": 6.579102773685008e-05, "reward": 0.9114583730697632, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 504.6875, "completions/min_length": 414.0, "epoch": 2.8294117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 0.8553456664085388, "kl": 0.005118036759085953, "learning_rate": 9.947703021946298e-07, "loss": 5.063321441411972e-05, "reward": 0.6074166297912598, "reward_std": 0.039867136627435684, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 520.3125, "completions/min_length": 449.0, "epoch": 2.8308823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.1099610328674316, "kl": 0.005034704110585153, "learning_rate": 9.947517732661732e-07, "loss": 5.1143862947355956e-05, "reward": 0.10625000298023224, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/mean_length": 515.75, "completions/min_length": 407.0, "epoch": 2.8323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.476353645324707, "kl": 0.005410105804912746, "learning_rate": 9.94733211744722e-07, "loss": 5.4508447647094727e-05, "reward": 0.6674916744232178, "reward_std": 0.2692090570926666, "rewards/DrugCombAccuracyCOTORM/mean": 0.6156145930290222, "rewards/DrugCombAccuracyCOTORM/std": 0.45401614904403687, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 432.6875, "completions/min_length": 356.0, "epoch": 2.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01062566414475441, "kl": 0.004641416249796748, "learning_rate": 9.947146176314986e-07, "loss": 4.6132878196658567e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 411.375, "completions/min_length": 380.0, "epoch": 2.835294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012035539373755455, "kl": 0.0047836501034908, "learning_rate": 9.946959909277283e-07, "loss": 4.815509601030499e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 484.5625, "completions/min_length": 425.0, "epoch": 2.836764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.815467119216919, "kl": 0.0057415246847085655, "learning_rate": 9.946773316346382e-07, "loss": 5.6684017181396484e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 484.75, "completions/min_length": 448.0, "epoch": 2.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1562800407409668, "kl": 0.00605656614061445, "learning_rate": 9.946586397534572e-07, "loss": 6.1011320212855935e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 480.5, "completions/min_length": 403.0, "epoch": 2.8397058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.013696476817131042, "kl": 0.005059440736658871, "learning_rate": 9.946399152854169e-07, "loss": 5.059482646174729e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 479.9375, "completions/min_length": 423.0, "epoch": 2.8411764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01029079407453537, "kl": 0.004471497202757746, "learning_rate": 9.946211582317506e-07, "loss": 4.4971417082706466e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 410.875, "completions/min_length": 348.0, "epoch": 2.8426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2327663898468018, "kl": 0.005178647406864911, "learning_rate": 9.946023685936942e-07, "loss": 5.173551107873209e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 427.4375, "completions/min_length": 391.0, "epoch": 2.8441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010888395830988884, "kl": 0.004468478378839791, "learning_rate": 9.945835463724855e-07, "loss": 4.465523670660332e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 470.5, "completions/min_length": 383.0, "epoch": 2.8455882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.9365979433059692, "kl": 0.0042059330735355616, "learning_rate": 9.945646915693644e-07, "loss": 4.204786819173023e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 440.8125, "completions/min_length": 397.0, "epoch": 2.847058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013527174480259418, "kl": 0.00535153248347342, "learning_rate": 9.94545804185573e-07, "loss": 5.2983465138822794e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 420.75, "completions/min_length": 362.0, "epoch": 2.848529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008800734765827656, "kl": 0.004067370551638305, "learning_rate": 9.945268842223555e-07, "loss": 4.1127663280349225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 403.25, "completions/min_length": 352.0, "epoch": 2.85, "frac_reward_zero_std": 0.0, "grad_norm": 1.5663374662399292, "kl": 0.005225973320193589, "learning_rate": 9.945079316809584e-07, "loss": 5.201995372772217e-05, "reward": 0.6625000238418579, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 464.3125, "completions/min_length": 403.0, "epoch": 2.851470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012382983230054379, "kl": 0.004930272581987083, "learning_rate": 9.9448894656263e-07, "loss": 5.0297690904699266e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 472.125, "completions/min_length": 399.0, "epoch": 2.8529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4062366485595703, "kl": 0.005861594807356596, "learning_rate": 9.944699288686216e-07, "loss": 5.838274955749512e-05, "reward": 0.6945833563804626, "reward_std": 0.3460341989994049, "rewards/DrugCombAccuracyCOTORM/mean": 0.6625000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4047633111476898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3542075455188751, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/mean_length": 515.25, "completions/min_length": 389.0, "epoch": 2.8544117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.4673184156417847, "kl": 0.005481966596562415, "learning_rate": 9.944508786001854e-07, "loss": 5.5164098739624023e-05, "reward": 0.3862749934196472, "reward_std": 0.283831387758255, "rewards/DrugCombAccuracyCOTORM/mean": 0.2614895701408386, "rewards/DrugCombAccuracyCOTORM/std": 0.3584076464176178, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.291070818901062, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 494.0, "completions/min_length": 394.0, "epoch": 2.8558823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.3538603782653809, "kl": 0.005626243422739208, "learning_rate": 9.944317957585767e-07, "loss": 5.605444312095642e-05, "reward": 0.7063583135604858, "reward_std": 0.4062386155128479, "rewards/DrugCombAccuracyCOTORM/mean": 0.637374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.4857471287250519, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9645833373069763, "rewards/DrugCombCoverageCOTORM/std": 0.10573814809322357, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 466.1875, "completions/min_length": 410.0, "epoch": 2.8573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9813079833984375, "kl": 0.0056713385274633765, "learning_rate": 9.944126803450524e-07, "loss": 5.664061245624907e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 517.1875, "completions/min_length": 442.0, "epoch": 2.8588235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.054240573197603226, "kl": 0.00574250885983929, "learning_rate": 9.94393532360872e-07, "loss": 5.7992383517557755e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 455.1875, "completions/min_length": 392.0, "epoch": 2.860294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0358045101165771, "kl": 0.004542249895166606, "learning_rate": 9.94374351807297e-07, "loss": 4.531058220891282e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 466.0625, "completions/min_length": 408.0, "epoch": 2.861764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0824247598648071, "kl": 0.005860683100763708, "learning_rate": 9.943551386855906e-07, "loss": 5.891551336389966e-05, "reward": 0.8577916622161865, "reward_std": 0.19648927450180054, "rewards/DrugCombAccuracyCOTORM/mean": 0.8365625143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.35157132148742676, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24883991479873657, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 455.6875, "completions/min_length": 401.0, "epoch": 2.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1505972146987915, "kl": 0.0047579758684150875, "learning_rate": 9.943358929970187e-07, "loss": 4.762451135320589e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 472.1875, "completions/min_length": 435.0, "epoch": 2.864705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.604170560836792, "kl": 0.005491505493409932, "learning_rate": 9.943166147428492e-07, "loss": 5.5186450481414795e-05, "reward": 0.4801250100135803, "reward_std": 0.2223067730665207, "rewards/DrugCombAccuracyCOTORM/mean": 0.3853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.49340811371803284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 495.25, "completions/min_length": 415.0, "epoch": 2.8661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2226784229278564, "kl": 0.004261099966242909, "learning_rate": 9.942973039243522e-07, "loss": 4.239380359649658e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 443.0625, "completions/min_length": 374.0, "epoch": 2.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.043985240161418915, "kl": 0.0064945846097543836, "learning_rate": 9.942779605427997e-07, "loss": 6.484852929133922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 465.9375, "completions/min_length": 392.0, "epoch": 2.8691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009979574009776115, "kl": 0.004013321653474122, "learning_rate": 9.94258584599466e-07, "loss": 4.040657222503796e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 527.4375, "completions/min_length": 430.0, "epoch": 2.8705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.2803200483322144, "kl": 0.004269034776370972, "learning_rate": 9.942391760956277e-07, "loss": 4.2498111724853516e-05, "reward": 0.8324375152587891, "reward_std": 0.30367299914360046, "rewards/DrugCombAccuracyCOTORM/mean": 0.7964062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.36838042736053467, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 462.5625, "completions/min_length": 378.0, "epoch": 2.8720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013668502680957317, "kl": 0.004649773647543043, "learning_rate": 9.94219735032563e-07, "loss": 4.643473221221939e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 432.6875, "completions/min_length": 395.0, "epoch": 2.873529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073877195827662945, "kl": 0.003794059797655791, "learning_rate": 9.94200261411553e-07, "loss": 3.803366416832432e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 468.375, "completions/min_length": 395.0, "epoch": 2.875, "frac_reward_zero_std": 1.0, "grad_norm": 0.008986677043139935, "kl": 0.003903959586750716, "learning_rate": 9.941807552338803e-07, "loss": 3.940306123695336e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.5625, "completions/min_length": 380.0, "epoch": 2.876470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020338933914899826, "kl": 0.0051477812230587006, "learning_rate": 9.941612165008301e-07, "loss": 5.175509431865066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 503.875, "completions/min_length": 444.0, "epoch": 2.8779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.011644959449768, "kl": 0.005209039431065321, "learning_rate": 9.941416452136898e-07, "loss": 5.1647424697875977e-05, "reward": 0.8999999761581421, "reward_std": 0.10690448433160782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 495.8125, "completions/min_length": 414.0, "epoch": 2.8794117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3351529836654663, "kl": 0.00452654215041548, "learning_rate": 9.941220413737482e-07, "loss": 4.540383815765381e-05, "reward": 0.574999988079071, "reward_std": 0.2121320366859436, "rewards/DrugCombAccuracyCOTORM/mean": 0.46875, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 493.5, "completions/min_length": 423.0, "epoch": 2.8808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8001475930213928, "kl": 0.004262384900357574, "learning_rate": 9.941024049822969e-07, "loss": 4.677851393353194e-05, "reward": 0.7945312857627869, "reward_std": 0.0022097050677984953, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.06404344737529755, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 436.625, "completions/min_length": 362.0, "epoch": 2.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008368054404854774, "kl": 0.0033618633751757443, "learning_rate": 9.940827360406296e-07, "loss": 3.368295438122004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 451.0625, "completions/min_length": 384.0, "epoch": 2.8838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011094200424849987, "kl": 0.0040811761864461005, "learning_rate": 9.940630345500421e-07, "loss": 4.1015708120539784e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 458.4375, "completions/min_length": 382.0, "epoch": 2.885294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012786097824573517, "kl": 0.005092830746434629, "learning_rate": 9.940433005118322e-07, "loss": 5.0855633162427694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 437.4375, "completions/min_length": 288.0, "epoch": 2.886764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.742545247077942, "kl": 0.006860170746222138, "learning_rate": 9.940235339273e-07, "loss": 6.768107414245605e-05, "reward": 0.6037083268165588, "reward_std": 0.2067062109708786, "rewards/DrugCombAccuracyCOTORM/mean": 0.5371875166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42861178517341614, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4947122633457184, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 418.5, "completions/min_length": 381.0, "epoch": 2.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014419418759644032, "kl": 0.004420624754857272, "learning_rate": 9.940037347977474e-07, "loss": 4.440839256858453e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 466.1875, "completions/min_length": 397.0, "epoch": 2.889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018057843670248985, "kl": 0.0046423645108006895, "learning_rate": 9.939839031244792e-07, "loss": 4.594161509885453e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 473.1875, "completions/min_length": 381.0, "epoch": 2.8911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.6722187399864197, "kl": 0.00469028705265373, "learning_rate": 9.939640389088013e-07, "loss": 4.699826240539551e-05, "reward": 0.8760416507720947, "reward_std": 0.050086721777915955, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.17078250646591187, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 471.0, "completions/min_length": 397.0, "epoch": 2.8926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8887789249420166, "kl": 0.005928290192969143, "learning_rate": 9.939441421520229e-07, "loss": 5.876924842596054e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 485.0625, "completions/min_length": 428.0, "epoch": 2.8941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9697694778442383, "kl": 0.004564347735140473, "learning_rate": 9.939242128554542e-07, "loss": 4.576303399517201e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 466.625, "completions/min_length": 358.0, "epoch": 2.8955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.014743000268936157, "kl": 0.004312243312597275, "learning_rate": 9.939042510204084e-07, "loss": 4.3121122871525586e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.4375, "completions/min_length": 388.0, "epoch": 2.8970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9922696948051453, "kl": 0.004624702734872699, "learning_rate": 9.938842566482004e-07, "loss": 4.628526949090883e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/mean_length": 517.3125, "completions/min_length": 392.0, "epoch": 2.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8564669489860535, "kl": 0.004697819298598915, "learning_rate": 9.938642297401474e-07, "loss": 4.717152478406206e-05, "reward": 0.6300337314605713, "reward_std": 0.03464758023619652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5664483904838562, "rewards/DrugCombAccuracyCOTORM/std": 0.4513285756111145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.768750011920929, "rewards/DrugCombCoverageCOTORM/std": 0.2414366751909256, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 420.25, "completions/min_length": 380.0, "epoch": 2.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.00957777164876461, "kl": 0.004092966089956462, "learning_rate": 9.938441702975689e-07, "loss": 4.0936276491265744e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 449.4375, "completions/min_length": 385.0, "epoch": 2.901470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0479469299316406, "kl": 0.005082028219476342, "learning_rate": 9.938240783217861e-07, "loss": 5.026116923545487e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 429.75, "completions/min_length": 397.0, "epoch": 2.902941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013384760357439518, "kl": 0.003587277082260698, "learning_rate": 9.938039538141227e-07, "loss": 3.586455568438396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 445.8125, "completions/min_length": 394.0, "epoch": 2.9044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.0268100500106812, "kl": 0.005428237491287291, "learning_rate": 9.937837967759046e-07, "loss": 5.459785461425781e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 479.375, "completions/min_length": 446.0, "epoch": 2.9058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.005627989768982, "kl": 0.004541079513728619, "learning_rate": 9.937636072084596e-07, "loss": 4.528200224740431e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 505.25, "completions/min_length": 456.0, "epoch": 2.9073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9268882274627686, "kl": 0.004224005911964923, "learning_rate": 9.937433851131177e-07, "loss": 4.220083064865321e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 419.5, "completions/min_length": 373.0, "epoch": 2.9088235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9065836668014526, "kl": 0.0041559189558029175, "learning_rate": 9.93723130491211e-07, "loss": 4.1334238630952314e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/mean_length": 610.6875, "completions/min_length": 545.0, "epoch": 2.9102941176470587, "frac_reward_zero_std": 0.0, "grad_norm": 1.3214365243911743, "kl": 0.008022110967431217, "learning_rate": 9.93702843344074e-07, "loss": 8.338689804077148e-05, "reward": 0.47137898206710815, "reward_std": 0.1483297199010849, "rewards/DrugCombAccuracyCOTORM/mean": 0.3839285969734192, "rewards/DrugCombAccuracyCOTORM/std": 0.4509061276912689, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6736111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.42400938272476196, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 418.8125, "completions/min_length": 376.0, "epoch": 2.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2099459171295166, "kl": 0.0058438003761693835, "learning_rate": 9.93682523673043e-07, "loss": 6.066262722015381e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 443.375, "completions/min_length": 365.0, "epoch": 2.913235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9763131141662598, "kl": 0.004439045034814626, "learning_rate": 9.936621714794569e-07, "loss": 4.388724482851103e-05, "reward": 0.7565000057220459, "reward_std": 0.0910470113158226, "rewards/DrugCombAccuracyCOTORM/mean": 0.721666693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.3305080235004425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 454.3125, "completions/min_length": 418.0, "epoch": 2.914705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011991435661911964, "kl": 0.0037561000790446997, "learning_rate": 9.93641786764656e-07, "loss": 3.7590401916531846e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 439.8125, "completions/min_length": 354.0, "epoch": 2.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008172511123120785, "kl": 0.0037659399094991386, "learning_rate": 9.936213695299834e-07, "loss": 3.780886618187651e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 461.75, "completions/min_length": 393.0, "epoch": 2.9176470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.3793208599090576, "kl": 0.005996347754262388, "learning_rate": 9.936009197767844e-07, "loss": 5.9757381677627563e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 484.8125, "completions/min_length": 430.0, "epoch": 2.9191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.012462775222957134, "kl": 0.0046227898565120995, "learning_rate": 9.935804375064058e-07, "loss": 4.6189459681045264e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 468.875, "completions/min_length": 407.0, "epoch": 2.9205882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 1.6987369060516357, "kl": 0.004605366033501923, "learning_rate": 9.935599227201974e-07, "loss": 4.6037137508392334e-05, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 476.0, "completions/min_length": 424.0, "epoch": 2.9220588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 1.274595022201538, "kl": 0.004817839595489204, "learning_rate": 9.935393754195101e-07, "loss": 4.797428846359253e-05, "reward": 0.7458333373069763, "reward_std": 0.1532064527273178, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.30561867356300354, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 447.1875, "completions/min_length": 391.0, "epoch": 2.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1639963388442993, "kl": 0.0041027365368790925, "learning_rate": 9.935187956056976e-07, "loss": 4.06801700592041e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 462.875, "completions/min_length": 415.0, "epoch": 2.925, "frac_reward_zero_std": 0.0, "grad_norm": 1.251395344734192, "kl": 0.00520517339464277, "learning_rate": 9.934981832801159e-07, "loss": 5.182996392250061e-05, "reward": 0.8374999761581421, "reward_std": 0.35143834352493286, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 520.25, "completions/min_length": 455.0, "epoch": 2.9264705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.984291672706604, "kl": 0.006471287575550377, "learning_rate": 9.934775384441228e-07, "loss": 6.446192128350958e-05, "reward": 0.8942611217498779, "reward_std": 0.10291367024183273, "rewards/DrugCombAccuracyCOTORM/mean": 0.8930000066757202, "rewards/DrugCombAccuracyCOTORM/std": 0.19303886592388153, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7986111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.4989185929298401, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 435.5, "completions/min_length": 388.0, "epoch": 2.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014586110599339008, "kl": 0.005121582536958158, "learning_rate": 9.934568610990783e-07, "loss": 5.134646926308051e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 397.1875, "completions/min_length": 310.0, "epoch": 2.9294117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.019683193415403366, "kl": 0.005132211837917566, "learning_rate": 9.934361512463445e-07, "loss": 5.159056308912113e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 429.25, "completions/min_length": 381.0, "epoch": 2.9308823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 1.0534669160842896, "kl": 0.005399376270361245, "learning_rate": 9.93415408887286e-07, "loss": 5.37675223313272e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 466.75, "completions/min_length": 375.0, "epoch": 2.932352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015781648457050323, "kl": 0.004589969641529024, "learning_rate": 9.933946340232687e-07, "loss": 4.60764167655725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 440.0, "completions/min_length": 383.0, "epoch": 2.9338235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.9971829056739807, "kl": 0.005224034306593239, "learning_rate": 9.933738266556618e-07, "loss": 5.188584327697754e-05, "reward": 0.375, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 437.0, "completions/min_length": 393.0, "epoch": 2.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016399916261434555, "kl": 0.003976555599365383, "learning_rate": 9.933529867858357e-07, "loss": 3.967207521782257e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 424.375, "completions/min_length": 374.0, "epoch": 2.9367647058823527, "frac_reward_zero_std": 1.0, "grad_norm": 0.012354512698948383, "kl": 0.0046601672656834126, "learning_rate": 9.933321144151634e-07, "loss": 4.671981878345832e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 433.1875, "completions/min_length": 370.0, "epoch": 2.9382352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.010127152316272259, "kl": 0.004769806517288089, "learning_rate": 9.933112095450197e-07, "loss": 4.794850246980786e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 449.3125, "completions/min_length": 363.0, "epoch": 2.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01054147258400917, "kl": 0.004283655202016234, "learning_rate": 9.93290272176782e-07, "loss": 4.277023617760278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 453.0625, "completions/min_length": 385.0, "epoch": 2.9411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.8363609313964844, "kl": 0.0033375077764503658, "learning_rate": 9.932693023118298e-07, "loss": 3.343820571899414e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 450.125, "completions/min_length": 379.0, "epoch": 2.942647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.4265676736831665, "kl": 0.006356253405101597, "learning_rate": 9.93248299951544e-07, "loss": 6.432340887840837e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 466.125, "completions/min_length": 396.0, "epoch": 2.9441176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.0930219888687134, "kl": 0.006031695171259344, "learning_rate": 9.932272650973084e-07, "loss": 6.0930848121643066e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 427.0625, "completions/min_length": 321.0, "epoch": 2.9455882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.007973715662956238, "kl": 0.0042368569993413985, "learning_rate": 9.932061977505089e-07, "loss": 4.290851211408153e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 447.5625, "completions/min_length": 403.0, "epoch": 2.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1087195873260498, "kl": 0.004674428724683821, "learning_rate": 9.931850979125333e-07, "loss": 4.6819448471069336e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 486.25, "completions/min_length": 404.0, "epoch": 2.9485294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.04661226272583, "kl": 0.007165202056057751, "learning_rate": 9.931639655847713e-07, "loss": 7.19316303730011e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 431.3125, "completions/min_length": 373.0, "epoch": 2.95, "frac_reward_zero_std": 0.5, "grad_norm": 1.463134527206421, "kl": 0.005044281773734838, "learning_rate": 9.931428007686156e-07, "loss": 5.019863601773977e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 407.625, "completions/min_length": 364.0, "epoch": 2.951470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9614917635917664, "kl": 0.006223425501957536, "learning_rate": 9.9312160346546e-07, "loss": 6.214529275894165e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 471.0625, "completions/min_length": 386.0, "epoch": 2.9529411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.013627961277961731, "kl": 0.0047242059372365475, "learning_rate": 9.931003736767012e-07, "loss": 4.776380956172943e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 504.6875, "completions/min_length": 455.0, "epoch": 2.9544117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 0.856394350528717, "kl": 0.005817465775180608, "learning_rate": 9.930791114037376e-07, "loss": 5.819733996759169e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 421.4375, "completions/min_length": 361.0, "epoch": 2.9558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.355065107345581, "kl": 0.005007506289985031, "learning_rate": 9.9305781664797e-07, "loss": 5.042552947998047e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 479.5, "completions/min_length": 404.0, "epoch": 2.9573529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.04684179276227951, "kl": 0.006132341630291194, "learning_rate": 9.930364894108012e-07, "loss": 6.19838829152286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 547.125, "completions/min_length": 478.0, "epoch": 2.958823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.486240029335022, "kl": 0.005459336214698851, "learning_rate": 9.930151296936361e-07, "loss": 5.4448843002319336e-05, "reward": 0.5459461212158203, "reward_std": 0.2887914180755615, "rewards/DrugCombAccuracyCOTORM/mean": 0.47774508595466614, "rewards/DrugCombAccuracyCOTORM/std": 0.333390474319458, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.637499988079071, "rewards/DrugCombCoverageCOTORM/std": 0.6551082134246826, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 463.25, "completions/min_length": 392.0, "epoch": 2.960294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3795640468597412, "kl": 0.004374161129817367, "learning_rate": 9.92993737497882e-07, "loss": 4.392862319946289e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 435.625, "completions/min_length": 355.0, "epoch": 2.961764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009630544111132622, "kl": 0.003889394283760339, "learning_rate": 9.929723128249483e-07, "loss": 3.910361192538403e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 562.9375, "completions/min_length": 407.0, "epoch": 2.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.720918595790863, "kl": 0.004830090911127627, "learning_rate": 9.929508556762459e-07, "loss": 4.808604717254639e-05, "reward": 0.7145595550537109, "reward_std": 0.22239819169044495, "rewards/DrugCombAccuracyCOTORM/mean": 0.6920833587646484, "rewards/DrugCombAccuracyCOTORM/std": 0.4448152184486389, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6089285612106323, "rewards/DrugCombCoverageCOTORM/std": 0.8007968664169312, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 487.3125, "completions/min_length": 406.0, "epoch": 2.9647058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.1835777759552002, "kl": 0.005697021202649921, "learning_rate": 9.929293660531887e-07, "loss": 5.71664422750473e-05, "reward": 0.6520000100135803, "reward_std": 0.22060762345790863, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.48780280351638794, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 410.9375, "completions/min_length": 362.0, "epoch": 2.9661764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.3615484237670898, "kl": 0.003973360522650182, "learning_rate": 9.929078439571924e-07, "loss": 3.9480626583099365e-05, "reward": 0.8600208163261414, "reward_std": 0.31566154956817627, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.26440009474754333, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 398.25, "completions/min_length": 346.0, "epoch": 2.9676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.007318637799471617, "kl": 0.004470848711207509, "learning_rate": 9.928862893896745e-07, "loss": 4.492751759244129e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 459.6875, "completions/min_length": 424.0, "epoch": 2.9691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01622658036649227, "kl": 0.004982409183867276, "learning_rate": 9.928647023520554e-07, "loss": 4.947656998410821e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 515.75, "completions/min_length": 422.0, "epoch": 2.9705882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.9036840796470642, "kl": 0.005418236716650426, "learning_rate": 9.92843082845757e-07, "loss": 5.428493022918701e-05, "reward": 0.7775000333786011, "reward_std": 0.22050268948078156, "rewards/DrugCombAccuracyCOTORM/mean": 0.737500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 500.625, "completions/min_length": 451.0, "epoch": 2.972058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.2006757259368896, "kl": 0.00495201424928382, "learning_rate": 9.928214308722034e-07, "loss": 5.0611793994903564e-05, "reward": 0.8388333320617676, "reward_std": 0.3540423810482025, "rewards/DrugCombAccuracyCOTORM/mean": 0.8193750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.38902390003204346, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3651483952999115, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 457.75, "completions/min_length": 413.0, "epoch": 2.973529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0509988069534302, "kl": 0.00510994833894074, "learning_rate": 9.927997464328213e-07, "loss": 5.085299562779255e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 467.25, "completions/min_length": 380.0, "epoch": 2.975, "frac_reward_zero_std": 0.5, "grad_norm": 0.9417516589164734, "kl": 0.004220557049848139, "learning_rate": 9.927780295290389e-07, "loss": 4.2007282900158316e-05, "reward": 0.9573999643325806, "reward_std": 0.12049099802970886, "rewards/DrugCombAccuracyCOTORM/mean": 0.949874997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.2004999965429306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 502.0, "completions/min_length": 387.0, "epoch": 2.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1092121601104736, "kl": 0.005880645418073982, "learning_rate": 9.92756280162287e-07, "loss": 5.9194862842559814e-05, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 450.9375, "completions/min_length": 393.0, "epoch": 2.9779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.4545766115188599, "kl": 0.012388923205435276, "learning_rate": 9.927344983339983e-07, "loss": 0.00012794607027899474, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 501.75, "completions/min_length": 369.0, "epoch": 2.9794117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.9798593521118164, "kl": 0.005622160620987415, "learning_rate": 9.92712684045608e-07, "loss": 5.669146776199341e-05, "reward": 0.4375, "reward_std": 0.46579423546791077, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 466.0, "completions/min_length": 403.0, "epoch": 2.9808823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.035301025956869125, "kl": 0.005186673428397626, "learning_rate": 9.926908372985528e-07, "loss": 5.1284001528983936e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 478.25, "completions/min_length": 415.0, "epoch": 2.9823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.3794729709625244, "kl": 0.007546060485765338, "learning_rate": 9.92668958094272e-07, "loss": 7.552255556220189e-05, "reward": 0.6032500267028809, "reward_std": 0.05394463613629341, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 448.0, "completions/min_length": 407.0, "epoch": 2.9838235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.023440957069397, "kl": 0.005771462805569172, "learning_rate": 9.92647046434207e-07, "loss": 5.828130088048056e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 411.875, "completions/min_length": 348.0, "epoch": 2.985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02209956757724285, "kl": 0.005733694648370147, "learning_rate": 9.926251023198018e-07, "loss": 5.6864435464376584e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 505.25, "completions/min_length": 452.0, "epoch": 2.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0181171894073486, "kl": 0.010211526183411479, "learning_rate": 9.926031257525011e-07, "loss": 0.00010197071969741955, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 488.6875, "completions/min_length": 375.0, "epoch": 2.988235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6710708141326904, "kl": 0.006837090011686087, "learning_rate": 9.925811167337532e-07, "loss": 6.901845335960388e-05, "reward": 0.5545833706855774, "reward_std": 0.39092108607292175, "rewards/DrugCombAccuracyCOTORM/mean": 0.47559523582458496, "rewards/DrugCombAccuracyCOTORM/std": 0.447451114654541, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7410714626312256, "rewards/DrugCombCoverageCOTORM/std": 0.5063374042510986, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 501.0, "completions/min_length": 437.0, "epoch": 2.989705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01011495478451252, "kl": 0.005430767894722521, "learning_rate": 9.92559075265008e-07, "loss": 5.4136769904289395e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 465.875, "completions/min_length": 382.0, "epoch": 2.9911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1193181276321411, "kl": 0.005095272499602288, "learning_rate": 9.925370013477173e-07, "loss": 5.036592483520508e-05, "reward": 0.8296874761581421, "reward_std": 0.23508523404598236, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 436.75, "completions/min_length": 386.0, "epoch": 2.9926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.007758244872093201, "kl": 0.004135436960496008, "learning_rate": 9.925148949833354e-07, "loss": 4.122468817513436e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 454.0, "completions/min_length": 431.0, "epoch": 2.9941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1272691488265991, "kl": 0.004851705045439303, "learning_rate": 9.924927561733188e-07, "loss": 4.820844333153218e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 466.25, "completions/min_length": 384.0, "epoch": 2.9955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.07936030626296997, "kl": 0.007817088393494487, "learning_rate": 9.924705849191255e-07, "loss": 7.810216629877687e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 454.0, "completions/min_length": 399.0, "epoch": 2.9970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1684317588806152, "kl": 0.004899419436696917, "learning_rate": 9.924483812222164e-07, "loss": 4.8839265218703076e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 477.8125, "completions/min_length": 386.0, "epoch": 2.998529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9416069388389587, "kl": 0.005696091102436185, "learning_rate": 9.924261450840544e-07, "loss": 5.684420466423035e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 428.0, "completions/min_length": 366.0, "epoch": 3.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.0705420970916748, "kl": 0.00599867437267676, "learning_rate": 9.92403876506104e-07, "loss": 6.166543607832864e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 482.4375, "completions/min_length": 412.0, "epoch": 3.001470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9652594327926636, "kl": 0.005925621022470295, "learning_rate": 9.923815754898324e-07, "loss": 5.9641897678375244e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 458.5625, "completions/min_length": 396.0, "epoch": 3.0029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9489161968231201, "kl": 0.0048534690286032856, "learning_rate": 9.923592420367085e-07, "loss": 4.905046080239117e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 480.375, "completions/min_length": 389.0, "epoch": 3.0044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8262234330177307, "kl": 0.004389003326650709, "learning_rate": 9.923368761482038e-07, "loss": 4.380941390991211e-05, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 466.3125, "completions/min_length": 386.0, "epoch": 3.0058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3950936794281006, "kl": 0.007277038064785302, "learning_rate": 9.923144778257916e-07, "loss": 7.337331771850586e-05, "reward": 0.8551666736602783, "reward_std": 0.2899210453033447, "rewards/DrugCombAccuracyCOTORM/mean": 0.8241666555404663, "rewards/DrugCombAccuracyCOTORM/std": 0.3450893759727478, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 451.5, "completions/min_length": 413.0, "epoch": 3.0073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007799653802067041, "kl": 0.0035938189830631018, "learning_rate": 9.922920470709478e-07, "loss": 3.5890916478820145e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 437.875, "completions/min_length": 398.0, "epoch": 3.0088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0518600940704346, "kl": 0.006134602706879377, "learning_rate": 9.922695838851495e-07, "loss": 6.21162325842306e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 485.5625, "completions/min_length": 419.0, "epoch": 3.010294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0846208333969116, "kl": 0.005063097749371082, "learning_rate": 9.92247088269877e-07, "loss": 4.99510279041715e-05, "reward": 0.949999988079071, "reward_std": 0.0690065398812294, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 421.75, "completions/min_length": 371.0, "epoch": 3.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0186684131622314, "kl": 0.0056595648638904095, "learning_rate": 9.922245602266118e-07, "loss": 5.6859105825424194e-05, "reward": 0.7171875238418579, "reward_std": 0.23422911763191223, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 434.3125, "completions/min_length": 321.0, "epoch": 3.013235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009260265156626701, "kl": 0.00421937188366428, "learning_rate": 9.922019997568383e-07, "loss": 4.287583215045743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 399.4375, "completions/min_length": 349.0, "epoch": 3.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.029967699199914932, "kl": 0.006338692212011665, "learning_rate": 9.921794068620428e-07, "loss": 6.316867074929178e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 472.25, "completions/min_length": 386.0, "epoch": 3.0161764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.144095540046692, "kl": 0.00566518004052341, "learning_rate": 9.921567815437134e-07, "loss": 5.637854337692261e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 422.1875, "completions/min_length": 386.0, "epoch": 3.0176470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9083280563354492, "kl": 0.004559155902825296, "learning_rate": 9.921341238033407e-07, "loss": 4.547834396362305e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 423.0, "completions/min_length": 379.0, "epoch": 3.0191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.00912066362798214, "kl": 0.004264551622327417, "learning_rate": 9.921114336424174e-07, "loss": 4.2812003812287e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 452.8125, "completions/min_length": 378.0, "epoch": 3.0205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1177589893341064, "kl": 0.005398215027526021, "learning_rate": 9.920887110624382e-07, "loss": 5.380809307098389e-05, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 471.875, "completions/min_length": 422.0, "epoch": 3.0220588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010729407891631126, "kl": 0.0038678221171721816, "learning_rate": 9.920659560649003e-07, "loss": 3.8595164369326085e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 516.6875, "completions/min_length": 437.0, "epoch": 3.023529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.366285800933838, "kl": 0.0043020209996029735, "learning_rate": 9.920431686513021e-07, "loss": 4.298985004425049e-05, "reward": 0.36250001192092896, "reward_std": 0.2906944453716278, "rewards/DrugCombAccuracyCOTORM/mean": 0.21875, "rewards/DrugCombAccuracyCOTORM/std": 0.4069705307483673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 487.125, "completions/min_length": 429.0, "epoch": 3.025, "frac_reward_zero_std": 0.0, "grad_norm": 1.2085607051849365, "kl": 0.004066123743541539, "learning_rate": 9.920203488231452e-07, "loss": 4.0650367736816406e-05, "reward": 0.7875000238418579, "reward_std": 0.3054315745830536, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3651483952999115, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 418.25, "completions/min_length": 371.0, "epoch": 3.026470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013006573542952538, "kl": 0.004563199123367667, "learning_rate": 9.91997496581933e-07, "loss": 4.564518530969508e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 473.3125, "completions/min_length": 385.0, "epoch": 3.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007201783824712038, "kl": 0.0037495220894925296, "learning_rate": 9.91974611929171e-07, "loss": 3.76155148842372e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 464.125, "completions/min_length": 392.0, "epoch": 3.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.010957245714962482, "kl": 0.004030636162497103, "learning_rate": 9.919516948663664e-07, "loss": 4.0701768739381805e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 438.9375, "completions/min_length": 380.0, "epoch": 3.0308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1376954317092896, "kl": 0.008058685110881925, "learning_rate": 9.919287453950292e-07, "loss": 8.514492947142571e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 482.0625, "completions/min_length": 415.0, "epoch": 3.0323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008564473129808903, "kl": 0.0036480281851254404, "learning_rate": 9.91905763516671e-07, "loss": 3.6577148421201855e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 541.3125, "completions/min_length": 454.0, "epoch": 3.0338235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7782381772994995, "kl": 0.005639617098495364, "learning_rate": 9.91882749232806e-07, "loss": 5.589680222328752e-05, "reward": 0.5979166626930237, "reward_std": 0.005892557092010975, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 455.0, "completions/min_length": 416.0, "epoch": 3.0352941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 2.7146899700164795, "kl": 0.005863275146111846, "learning_rate": 9.918597025449504e-07, "loss": 5.9001147747039795e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 411.375, "completions/min_length": 358.0, "epoch": 3.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2016189098358154, "kl": 0.005435802275314927, "learning_rate": 9.918366234546222e-07, "loss": 5.3965472034178674e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 423.0625, "completions/min_length": 383.0, "epoch": 3.038235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9258694052696228, "kl": 0.004773356951773167, "learning_rate": 9.91813511963342e-07, "loss": 4.703551530838013e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 445.0, "completions/min_length": 360.0, "epoch": 3.039705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.006381053943186998, "kl": 0.0036690905108116567, "learning_rate": 9.917903680726322e-07, "loss": 3.710826422320679e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.0, "completions/min_length": 374.0, "epoch": 3.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0519744157791138, "kl": 0.005645700148306787, "learning_rate": 9.917671917840176e-07, "loss": 5.733966827392578e-05, "reward": 0.84375, "reward_std": 0.21619683504104614, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 436.4375, "completions/min_length": 358.0, "epoch": 3.0426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01567646488547325, "kl": 0.006133355782367289, "learning_rate": 9.917439830990249e-07, "loss": 6.17580590187572e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 435.875, "completions/min_length": 372.0, "epoch": 3.0441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.012413766235113144, "kl": 0.004464669211301953, "learning_rate": 9.91720742019183e-07, "loss": 4.483553129830398e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 444.8125, "completions/min_length": 386.0, "epoch": 3.0455882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.013483600690960884, "kl": 0.004609032068401575, "learning_rate": 9.916974685460232e-07, "loss": 4.5982560550328344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 484.6875, "completions/min_length": 412.0, "epoch": 3.0470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 1.540472388267517, "kl": 0.006957276724278927, "learning_rate": 9.916741626810783e-07, "loss": 7.0229172706604e-05, "reward": 0.9270833730697632, "reward_std": 0.2062394767999649, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 444.4375, "completions/min_length": 378.0, "epoch": 3.0485294117647057, "frac_reward_zero_std": 0.5, "grad_norm": 1.0658830404281616, "kl": 0.004239457892253995, "learning_rate": 9.916508244258838e-07, "loss": 4.252046346664429e-05, "reward": 0.843250036239624, "reward_std": 0.16757279634475708, "rewards/DrugCombAccuracyCOTORM/mean": 0.8118749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.3365282416343689, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.11180340498685837, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 527.875, "completions/min_length": 407.0, "epoch": 3.05, "frac_reward_zero_std": 0.0, "grad_norm": 1.285975456237793, "kl": 0.005351451109163463, "learning_rate": 9.916274537819773e-07, "loss": 5.3122639656066895e-05, "reward": 0.5768985748291016, "reward_std": 0.210051029920578, "rewards/DrugCombAccuracyCOTORM/mean": 0.5038055777549744, "rewards/DrugCombAccuracyCOTORM/std": 0.459309846162796, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7541666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.2793842554092407, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 492.8125, "completions/min_length": 421.0, "epoch": 3.051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8611390590667725, "kl": 0.0052704858826473355, "learning_rate": 9.916040507508983e-07, "loss": 5.250424146652222e-05, "reward": 0.7044166326522827, "reward_std": 0.1958489567041397, "rewards/DrugCombAccuracyCOTORM/mean": 0.6800000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4316789209842682, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.71200031042099, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 413.8125, "completions/min_length": 368.0, "epoch": 3.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.008155949413776398, "kl": 0.003863214049488306, "learning_rate": 9.915806153341884e-07, "loss": 3.861900404444896e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 447.0, "completions/min_length": 407.0, "epoch": 3.054411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01500769518315792, "kl": 0.005205988069064915, "learning_rate": 9.915571475333917e-07, "loss": 5.182786480872892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 524.8125, "completions/min_length": 432.0, "epoch": 3.0558823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 60.621395111083984, "kl": 0.4407320001046173, "learning_rate": 9.91533647350054e-07, "loss": 0.004788435995578766, "reward": 0.5051249861717224, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.4478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5045558214187622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.4989572763442993, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 502.625, "completions/min_length": 474.0, "epoch": 3.0573529411764704, "frac_reward_zero_std": 0.5, "grad_norm": 1.118848443031311, "kl": 0.005861347075551748, "learning_rate": 9.915101147857236e-07, "loss": 5.80801788601093e-05, "reward": 0.9026666879653931, "reward_std": 0.18022631108760834, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3039928674697876, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2277100384235382, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 437.75, "completions/min_length": 398.0, "epoch": 3.0588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.02123923972249031, "kl": 0.005086358287371695, "learning_rate": 9.91486549841951e-07, "loss": 5.080592382000759e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 455.0, "completions/min_length": 400.0, "epoch": 3.0602941176470586, "frac_reward_zero_std": 1.0, "grad_norm": 0.02078857272863388, "kl": 0.005594509246293455, "learning_rate": 9.914629525202877e-07, "loss": 5.6699966080486774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 484.875, "completions/min_length": 423.0, "epoch": 3.0617647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 1.2646045684814453, "kl": 0.006829967023804784, "learning_rate": 9.914393228222893e-07, "loss": 6.92605972290039e-05, "reward": 0.643750011920929, "reward_std": 0.14500615000724792, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 424.625, "completions/min_length": 358.0, "epoch": 3.0632352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.009657595306634903, "kl": 0.004730729968287051, "learning_rate": 9.91415660749512e-07, "loss": 4.72145875392016e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 478.0625, "completions/min_length": 389.0, "epoch": 3.0647058823529414, "frac_reward_zero_std": 1.0, "grad_norm": 0.12393105030059814, "kl": 0.00629333226243034, "learning_rate": 9.913919663035142e-07, "loss": 6.29650749033317e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 460.6875, "completions/min_length": 408.0, "epoch": 3.0661764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 1.5796546936035156, "kl": 0.0057447998551651835, "learning_rate": 9.913682394858575e-07, "loss": 5.762651562690735e-05, "reward": 0.7156250476837158, "reward_std": 0.4030206799507141, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 428.5625, "completions/min_length": 393.0, "epoch": 3.0676470588235296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02531282976269722, "kl": 0.004781476745847613, "learning_rate": 9.913444802981045e-07, "loss": 4.842330235987902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 475.8125, "completions/min_length": 384.0, "epoch": 3.0691176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.1109683513641357, "kl": 0.004716119263321161, "learning_rate": 9.913206887418209e-07, "loss": 4.707530024461448e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 418.75, "completions/min_length": 376.0, "epoch": 3.070588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009824620559811592, "kl": 0.00468753871973604, "learning_rate": 9.912968648185733e-07, "loss": 4.662636638386175e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 467.0, "completions/min_length": 416.0, "epoch": 3.072058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.1976970434188843, "kl": 0.003982192603871226, "learning_rate": 9.912730085299318e-07, "loss": 3.962218761444092e-05, "reward": 0.71875, "reward_std": 0.39963412284851074, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 501.375, "completions/min_length": 379.0, "epoch": 3.073529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.1886308193206787, "kl": 0.004465985926799476, "learning_rate": 9.912491198774678e-07, "loss": 4.501640796661377e-05, "reward": 0.8495416641235352, "reward_std": 0.26242613792419434, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3118448555469513, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2083333432674408, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 497.5625, "completions/min_length": 398.0, "epoch": 3.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.8498550057411194, "kl": 0.004008090065326542, "learning_rate": 9.912251988627548e-07, "loss": 4.017327955807559e-05, "reward": 0.862500011920929, "reward_std": 0.17945021390914917, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.291070818901062, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5288001894950867, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 459.625, "completions/min_length": 370.0, "epoch": 3.0764705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 0.9501562714576721, "kl": 0.006260956171900034, "learning_rate": 9.912012454873689e-07, "loss": 6.221642252057791e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 502.4375, "completions/min_length": 469.0, "epoch": 3.0779411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 0.01809665374457836, "kl": 0.0052662790403701365, "learning_rate": 9.911772597528881e-07, "loss": 5.2334795327624306e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 496.4375, "completions/min_length": 445.0, "epoch": 3.0794117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 0.6965066194534302, "kl": 0.005091391853056848, "learning_rate": 9.911532416608924e-07, "loss": 5.1114031521137804e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 513.5, "completions/min_length": 421.0, "epoch": 3.0808823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.4511172771453857, "kl": 0.0052823322475887835, "learning_rate": 9.91129191212964e-07, "loss": 5.2988529205322266e-05, "reward": 0.9074000120162964, "reward_std": 0.26191234588623047, "rewards/DrugCombAccuracyCOTORM/mean": 0.887374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.3098659813404083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 431.3125, "completions/min_length": 378.0, "epoch": 3.0823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.009234120137989521, "kl": 0.003786723711527884, "learning_rate": 9.911051084106876e-07, "loss": 3.773547359742224e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 483.0, "completions/min_length": 407.0, "epoch": 3.083823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9516812562942505, "kl": 0.005425949231721461, "learning_rate": 9.910809932556493e-07, "loss": 5.3423518693307415e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 433.625, "completions/min_length": 384.0, "epoch": 3.085294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0689235925674438, "kl": 0.005366204306483269, "learning_rate": 9.91056845749438e-07, "loss": 5.352106745704077e-05, "reward": 0.3812500238418579, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 401.5625, "completions/min_length": 309.0, "epoch": 3.086764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4158294200897217, "kl": 0.005995742976665497, "learning_rate": 9.910326658936443e-07, "loss": 6.002187728881836e-05, "reward": 0.8767499923706055, "reward_std": 0.26834434270858765, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 428.1875, "completions/min_length": 399.0, "epoch": 3.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9506531953811646, "kl": 0.004319742671214044, "learning_rate": 9.910084536898613e-07, "loss": 4.303943569539115e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 422.3125, "completions/min_length": 332.0, "epoch": 3.0897058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.8695918321609497, "kl": 0.005358197551686317, "learning_rate": 9.909842091396841e-07, "loss": 5.2852883527521044e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 452.4375, "completions/min_length": 407.0, "epoch": 3.0911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0346091985702515, "kl": 0.004470939165912569, "learning_rate": 9.909599322447096e-07, "loss": 4.462897777557373e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 486.375, "completions/min_length": 427.0, "epoch": 3.0926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067631276324391365, "kl": 0.003124808135908097, "learning_rate": 9.909356230065373e-07, "loss": 3.1524134101346135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 465.625, "completions/min_length": 387.0, "epoch": 3.0941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4298235177993774, "kl": 0.004347649461124092, "learning_rate": 9.909112814267683e-07, "loss": 4.380941390991211e-05, "reward": 0.5249999761581421, "reward_std": 0.41661906242370605, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 483.25, "completions/min_length": 403.0, "epoch": 3.0955882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.9568169713020325, "kl": 0.00559191545471549, "learning_rate": 9.908869075070068e-07, "loss": 5.59632753720507e-05, "reward": 0.6079999804496765, "reward_std": 0.03527848422527313, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47761037945747375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 439.8125, "completions/min_length": 364.0, "epoch": 3.097058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04942450672388077, "kl": 0.005271770409308374, "learning_rate": 9.908625012488578e-07, "loss": 5.295282608130947e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 475.25, "completions/min_length": 395.0, "epoch": 3.098529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007676697801798582, "kl": 0.003940297348890454, "learning_rate": 9.908380626539297e-07, "loss": 3.940789247280918e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 483.9375, "completions/min_length": 447.0, "epoch": 3.1, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859822034835815, "kl": 0.0045630287495441735, "learning_rate": 9.90813591723832e-07, "loss": 4.556775093078613e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 460.5625, "completions/min_length": 341.0, "epoch": 3.101470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9344930648803711, "kl": 0.004361131170298904, "learning_rate": 9.90789088460177e-07, "loss": 4.2743980884552e-05, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 464.25, "completions/min_length": 433.0, "epoch": 3.1029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1114857196807861, "kl": 0.0058721742243506014, "learning_rate": 9.907645528645789e-07, "loss": 5.891995897400193e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 479.75, "completions/min_length": 426.0, "epoch": 3.1044117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.4309556484222412, "kl": 0.005340011091902852, "learning_rate": 9.90739984938654e-07, "loss": 5.393475294113159e-05, "reward": 0.7552083730697632, "reward_std": 0.29749131202697754, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.43977582454681396, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 435.125, "completions/min_length": 378.0, "epoch": 3.1058823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.175890326499939, "kl": 0.019659636891447008, "learning_rate": 9.907153846840208e-07, "loss": 0.0002103114384226501, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 487.1875, "completions/min_length": 430.0, "epoch": 3.1073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8484170436859131, "kl": 0.004096372809726745, "learning_rate": 9.906907521023e-07, "loss": 4.093350798939355e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 439.875, "completions/min_length": 358.0, "epoch": 3.1088235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.130751132965088, "kl": 0.007380058348644525, "learning_rate": 9.90666087195114e-07, "loss": 7.328391075134277e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 453.5625, "completions/min_length": 375.0, "epoch": 3.110294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010711517184972763, "kl": 0.004665388027206063, "learning_rate": 9.906413899640882e-07, "loss": 4.626041481969878e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 437.8125, "completions/min_length": 382.0, "epoch": 3.111764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008314973674714565, "kl": 0.004188014951068908, "learning_rate": 9.906166604108493e-07, "loss": 4.158064257353544e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 439.0625, "completions/min_length": 390.0, "epoch": 3.113235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3158656358718872, "kl": 0.005502448417246342, "learning_rate": 9.90591898537026e-07, "loss": 5.5402517318725586e-05, "reward": 0.7999999523162842, "reward_std": 0.3484410047531128, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 480.5, "completions/min_length": 422.0, "epoch": 3.114705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1272929906845093, "kl": 0.004787376034073532, "learning_rate": 9.905671043442503e-07, "loss": 4.805624485015869e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 456.125, "completions/min_length": 394.0, "epoch": 3.1161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008828017860651016, "kl": 0.004010451142676175, "learning_rate": 9.905422778341553e-07, "loss": 4.025257658213377e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 481.5625, "completions/min_length": 397.0, "epoch": 3.1176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6037567853927612, "kl": 0.007186107919551432, "learning_rate": 9.905174190083762e-07, "loss": 7.093697786331177e-05, "reward": 0.65625, "reward_std": 0.47638368606567383, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 501.125, "completions/min_length": 441.0, "epoch": 3.1191176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.2913461923599243, "kl": 0.006916578975506127, "learning_rate": 9.90492527868551e-07, "loss": 6.964802742004395e-05, "reward": 0.8158749938011169, "reward_std": 0.23165899515151978, "rewards/DrugCombAccuracyCOTORM/mean": 0.784166693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.34634920954704285, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17969882488250732, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 463.875, "completions/min_length": 419.0, "epoch": 3.1205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.3318127393722534, "kl": 0.004446528619155288, "learning_rate": 9.904676044163193e-07, "loss": 4.4561922550201416e-05, "reward": 0.659375011920929, "reward_std": 0.12096155434846878, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4552929699420929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 516.75, "completions/min_length": 427.0, "epoch": 3.1220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8491761684417725, "kl": 0.005499929538927972, "learning_rate": 9.90442648653323e-07, "loss": 5.486893860506825e-05, "reward": 0.5484374761581421, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 417.125, "completions/min_length": 365.0, "epoch": 3.123529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012652144767343998, "kl": 0.00503587385173887, "learning_rate": 9.904176605812062e-07, "loss": 5.050349864177406e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 421.8125, "completions/min_length": 367.0, "epoch": 3.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.994349479675293, "kl": 0.0041840397752821445, "learning_rate": 9.90392640201615e-07, "loss": 4.1643434087745845e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 476.3125, "completions/min_length": 426.0, "epoch": 3.126470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4424331188201904, "kl": 0.005054834415204823, "learning_rate": 9.90367587516198e-07, "loss": 5.066394805908203e-05, "reward": 0.7749999761581421, "reward_std": 0.41661903262138367, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 453.9375, "completions/min_length": 393.0, "epoch": 3.1279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8995582461357117, "kl": 0.0051239123567938805, "learning_rate": 9.90342502526605e-07, "loss": 5.1319115300429985e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 421.6875, "completions/min_length": 380.0, "epoch": 3.1294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01089142169803381, "kl": 0.004595092730596662, "learning_rate": 9.903173852344887e-07, "loss": 4.6020075387787074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 441.6875, "completions/min_length": 330.0, "epoch": 3.1308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0857622623443604, "kl": 0.0119020288111642, "learning_rate": 9.902922356415042e-07, "loss": 0.00012377649545669556, "reward": 0.6779999732971191, "reward_std": 0.14082613587379456, "rewards/DrugCombAccuracyCOTORM/mean": 0.6332142949104309, "rewards/DrugCombAccuracyCOTORM/std": 0.4318942725658417, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7142857313156128, "rewards/DrugCombCoverageCOTORM/std": 0.5030519366264343, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 460.375, "completions/min_length": 390.0, "epoch": 3.1323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8972299695014954, "kl": 0.004535849904641509, "learning_rate": 9.902670537493079e-07, "loss": 4.529881334747188e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 543.8125, "completions/min_length": 454.0, "epoch": 3.1338235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.5610501766204834, "kl": 0.005945155629888177, "learning_rate": 9.902418395595588e-07, "loss": 5.9641897678375244e-05, "reward": 0.5519999861717224, "reward_std": 0.2867504358291626, "rewards/DrugCombAccuracyCOTORM/mean": 0.4907812476158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43457847833633423, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.3598804175853729, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 446.0, "completions/min_length": 368.0, "epoch": 3.135294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008504515513777733, "kl": 0.004595695878379047, "learning_rate": 9.902165930739178e-07, "loss": 4.61655727121979e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 492.25, "completions/min_length": 358.0, "epoch": 3.136764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8634651899337769, "kl": 0.006478263530880213, "learning_rate": 9.901913142940484e-07, "loss": 6.379187107086182e-05, "reward": 0.2109166532754898, "reward_std": 0.02946278266608715, "rewards/DrugCombAccuracyCOTORM/mean": 0.10999999940395355, "rewards/DrugCombAccuracyCOTORM/std": 0.1136075109243393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4669642150402069, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 525.1875, "completions/min_length": 456.0, "epoch": 3.138235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8460144400596619, "kl": 0.005841952748596668, "learning_rate": 9.901660032216157e-07, "loss": 5.8002769947052e-05, "reward": 0.27916666865348816, "reward_std": 0.14246973395347595, "rewards/DrugCombAccuracyCOTORM/mean": 0.1145833358168602, "rewards/DrugCombAccuracyCOTORM/std": 0.24883991479873657, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 477.9375, "completions/min_length": 420.0, "epoch": 3.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009401868097484112, "kl": 0.0036346660926938057, "learning_rate": 9.901406598582871e-07, "loss": 3.6445591831579804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 481.5625, "completions/min_length": 418.0, "epoch": 3.1411764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.7370755076408386, "kl": 0.00418758939485997, "learning_rate": 9.901152842057323e-07, "loss": 4.1923449316527694e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 471.4375, "completions/min_length": 409.0, "epoch": 3.1426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.02389589138329029, "kl": 0.005744206719100475, "learning_rate": 9.90089876265623e-07, "loss": 5.746377428295091e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 434.4375, "completions/min_length": 398.0, "epoch": 3.1441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.450327754020691, "kl": 0.00505713646998629, "learning_rate": 9.900644360396327e-07, "loss": 5.048513412475586e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 465.875, "completions/min_length": 411.0, "epoch": 3.1455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8253332376480103, "kl": 0.005672415834851563, "learning_rate": 9.900389635294378e-07, "loss": 5.6620981922606006e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 448.3125, "completions/min_length": 367.0, "epoch": 3.1470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.077831506729126, "kl": 0.005024339654482901, "learning_rate": 9.90013458736716e-07, "loss": 5.019801028538495e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 503.9375, "completions/min_length": 417.0, "epoch": 3.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1405822038650513, "kl": 0.0058606594684533775, "learning_rate": 9.899879216631476e-07, "loss": 5.8949925005435944e-05, "reward": 0.6505833268165588, "reward_std": 0.14483553171157837, "rewards/DrugCombAccuracyCOTORM/mean": 0.597083330154419, "rewards/DrugCombAccuracyCOTORM/std": 0.46426665782928467, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 392.5625, "completions/min_length": 342.0, "epoch": 3.15, "frac_reward_zero_std": 0.5, "grad_norm": 1.4182243347167969, "kl": 0.00463277637027204, "learning_rate": 9.899623523104148e-07, "loss": 4.6357530663954094e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 403.5625, "completions/min_length": 369.0, "epoch": 3.151470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.088843822479248, "kl": 0.0037289491738192737, "learning_rate": 9.899367506802021e-07, "loss": 3.7361889553721994e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 443.5, "completions/min_length": 370.0, "epoch": 3.152941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016663171350955963, "kl": 0.005589492036961019, "learning_rate": 9.899111167741965e-07, "loss": 5.526895256480202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 435.9375, "completions/min_length": 392.0, "epoch": 3.1544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.014728120528161526, "kl": 0.003934944688808173, "learning_rate": 9.89885450594086e-07, "loss": 3.918215952580795e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 474.3125, "completions/min_length": 361.0, "epoch": 3.1558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9841768741607666, "kl": 0.003802750667091459, "learning_rate": 9.89859752141562e-07, "loss": 3.789365291595459e-05, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 470.9375, "completions/min_length": 404.0, "epoch": 3.1573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8263794183731079, "kl": 0.004986682906746864, "learning_rate": 9.898340214183173e-07, "loss": 5.0356335123069584e-05, "reward": 0.6312500238418579, "reward_std": 0.15103808045387268, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 441.0, "completions/min_length": 402.0, "epoch": 3.1588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.011434681713581085, "kl": 0.004778212518431246, "learning_rate": 9.898082584260465e-07, "loss": 4.738496500067413e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 422.5625, "completions/min_length": 377.0, "epoch": 3.1602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.01826479658484459, "kl": 0.005146522540599108, "learning_rate": 9.897824631664473e-07, "loss": 5.1695511501748115e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 479.9375, "completions/min_length": 415.0, "epoch": 3.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1522077322006226, "kl": 0.00539726298302412, "learning_rate": 9.897566356412191e-07, "loss": 5.369615610106848e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 434.0625, "completions/min_length": 379.0, "epoch": 3.163235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9504344463348389, "kl": 0.0047919421922415495, "learning_rate": 9.89730775852063e-07, "loss": 4.8308058467227966e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 434.25, "completions/min_length": 374.0, "epoch": 3.164705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9642778635025024, "kl": 0.005286228784825653, "learning_rate": 9.89704883800683e-07, "loss": 5.32492995262146e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 463.9375, "completions/min_length": 404.0, "epoch": 3.166176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1595631837844849, "kl": 0.007202984066680074, "learning_rate": 9.896789594887842e-07, "loss": 7.255375385284424e-05, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 479.0625, "completions/min_length": 393.0, "epoch": 3.1676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.006594481412321329, "kl": 0.004232516337651759, "learning_rate": 9.896530029180749e-07, "loss": 4.264235030859709e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 413.3125, "completions/min_length": 369.0, "epoch": 3.1691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.025336777791380882, "kl": 0.004642659390810877, "learning_rate": 9.896270140902649e-07, "loss": 4.623688437277451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 457.125, "completions/min_length": 382.0, "epoch": 3.1705882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.2025257349014282, "kl": 0.004619330400601029, "learning_rate": 9.896009930070665e-07, "loss": 4.6584380470449105e-05, "reward": 0.9056999683380127, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8914999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.2964784502983093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 443.1875, "completions/min_length": 295.0, "epoch": 3.1720588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 1.351515293121338, "kl": 0.006758319272194058, "learning_rate": 9.895749396701935e-07, "loss": 6.882500019855797e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 495.3125, "completions/min_length": 454.0, "epoch": 3.1735294117647057, "frac_reward_zero_std": 0.5, "grad_norm": 1.0080095529556274, "kl": 0.004730493819806725, "learning_rate": 9.895488540813626e-07, "loss": 4.749372601509094e-05, "reward": 0.9358333349227905, "reward_std": 0.09250910580158234, "rewards/DrugCombAccuracyCOTORM/mean": 0.925000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.17320507764816284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 420.9375, "completions/min_length": 372.0, "epoch": 3.175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9615130424499512, "kl": 0.005594983580522239, "learning_rate": 9.89522736242292e-07, "loss": 5.5783231800887734e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 450.9375, "completions/min_length": 381.0, "epoch": 3.176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1029949188232422, "kl": 0.004403321247082204, "learning_rate": 9.894965861547022e-07, "loss": 4.411585541674867e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 541.25, "completions/min_length": 477.0, "epoch": 3.177941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.452441692352295, "kl": 0.006494981178548187, "learning_rate": 9.894704038203162e-07, "loss": 6.569921970367432e-05, "reward": 0.6169524192810059, "reward_std": 0.31460708379745483, "rewards/DrugCombAccuracyCOTORM/mean": 0.5628571510314941, "rewards/DrugCombAccuracyCOTORM/std": 0.46087610721588135, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.570899248123169, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 485.9375, "completions/min_length": 415.0, "epoch": 3.179411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3786112070083618, "kl": 0.005235998309217393, "learning_rate": 9.894441892408588e-07, "loss": 5.234032869338989e-05, "reward": 0.6020833253860474, "reward_std": 0.2769501805305481, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 438.0625, "completions/min_length": 392.0, "epoch": 3.1808823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.012300155125558376, "kl": 0.004931763745844364, "learning_rate": 9.894179424180566e-07, "loss": 4.907175025437027e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.1875, "completions/min_length": 377.0, "epoch": 3.1823529411764704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008247875608503819, "kl": 0.005002087214961648, "learning_rate": 9.893916633536392e-07, "loss": 4.960636579198763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 506.125, "completions/min_length": 413.0, "epoch": 3.1838235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 1.4410954713821411, "kl": 0.006665445165708661, "learning_rate": 9.893653520493372e-07, "loss": 6.601214408874512e-05, "reward": 0.5830000042915344, "reward_std": 0.45036330819129944, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371693611145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.6992058753967285, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 482.625, "completions/min_length": 403.0, "epoch": 3.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1171752214431763, "kl": 0.005903906770981848, "learning_rate": 9.893390085068845e-07, "loss": 5.9545040130615234e-05, "reward": 0.9489166736602783, "reward_std": 0.11560136079788208, "rewards/DrugCombAccuracyCOTORM/mean": 0.9387500286102295, "rewards/DrugCombAccuracyCOTORM/std": 0.1980530321598053, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 466.625, "completions/min_length": 353.0, "epoch": 3.1867647058823527, "frac_reward_zero_std": 1.0, "grad_norm": 0.012121334671974182, "kl": 0.005669072852469981, "learning_rate": 9.893126327280162e-07, "loss": 5.681578113581054e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 448.75, "completions/min_length": 379.0, "epoch": 3.1882352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 0.9528396129608154, "kl": 0.005016641225665808, "learning_rate": 9.8928622471447e-07, "loss": 4.9933791160583496e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 442.5, "completions/min_length": 349.0, "epoch": 3.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.048495613038539886, "kl": 0.0054187330533750355, "learning_rate": 9.892597844679855e-07, "loss": 5.3953375754645094e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 481.5, "completions/min_length": 400.0, "epoch": 3.1911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9376713633537292, "kl": 0.004827350494451821, "learning_rate": 9.892333119903046e-07, "loss": 4.8547983169555664e-05, "reward": 0.6820833683013916, "reward_std": 0.13966327905654907, "rewards/DrugCombAccuracyCOTORM/mean": 0.6312500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4371403455734253, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3381595313549042, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 457.875, "completions/min_length": 400.0, "epoch": 3.1926470588235296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01294051855802536, "kl": 0.0050488661509007215, "learning_rate": 9.89206807283171e-07, "loss": 5.062528725829907e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 482.875, "completions/min_length": 423.0, "epoch": 3.1941176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.007911606691777706, "kl": 0.0047537460923194885, "learning_rate": 9.891802703483313e-07, "loss": 4.748746141558513e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 442.375, "completions/min_length": 415.0, "epoch": 3.195588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01117384061217308, "kl": 0.00551450252532959, "learning_rate": 9.89153701187533e-07, "loss": 5.5135638831416145e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 470.5625, "completions/min_length": 393.0, "epoch": 3.197058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.033435314893722534, "kl": 0.005922136711888015, "learning_rate": 9.89127099802527e-07, "loss": 5.854213304701261e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 485.5625, "completions/min_length": 409.0, "epoch": 3.198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.3919540643692017, "kl": 0.006489703198894858, "learning_rate": 9.891004661950653e-07, "loss": 6.451644730987027e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 458.625, "completions/min_length": 404.0, "epoch": 3.2, "frac_reward_zero_std": 0.5, "grad_norm": 1.0702874660491943, "kl": 0.004466686688829213, "learning_rate": 9.890738003669027e-07, "loss": 4.466758400667459e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 457.5625, "completions/min_length": 426.0, "epoch": 3.2014705882352943, "frac_reward_zero_std": 1.0, "grad_norm": 0.007698835339397192, "kl": 0.003635982982814312, "learning_rate": 9.89047102319796e-07, "loss": 3.6359164369059727e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 459.9375, "completions/min_length": 402.0, "epoch": 3.2029411764705884, "frac_reward_zero_std": 1.0, "grad_norm": 3.0077357292175293, "kl": 0.051560524152591825, "learning_rate": 9.890203720555037e-07, "loss": 0.0005039876559749246, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 463.9375, "completions/min_length": 397.0, "epoch": 3.2044117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.012210349552333355, "kl": 0.005672762752510607, "learning_rate": 9.889936095757867e-07, "loss": 5.7606805057730526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 481.4375, "completions/min_length": 417.0, "epoch": 3.2058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.948506236076355, "kl": 0.004845793591812253, "learning_rate": 9.889668148824083e-07, "loss": 4.834495484828949e-05, "reward": 0.7403278350830078, "reward_std": 0.17843830585479736, "rewards/DrugCombAccuracyCOTORM/mean": 0.7109999656677246, "rewards/DrugCombAccuracyCOTORM/std": 0.4084637761116028, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7152777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.4951929748058319, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 456.4375, "completions/min_length": 408.0, "epoch": 3.2073529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01032730471342802, "kl": 0.005211992422118783, "learning_rate": 9.889399879771337e-07, "loss": 5.2247294661356136e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 453.5, "completions/min_length": 386.0, "epoch": 3.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019225431606173515, "kl": 0.00510162825230509, "learning_rate": 9.8891312886173e-07, "loss": 5.093141953693703e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 451.5625, "completions/min_length": 402.0, "epoch": 3.210294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014641319401562214, "kl": 0.0053128430736251175, "learning_rate": 9.888862375379664e-07, "loss": 5.350272112991661e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 507.0625, "completions/min_length": 443.0, "epoch": 3.211764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2594877481460571, "kl": 0.004860107903368771, "learning_rate": 9.888593140076152e-07, "loss": 4.916638135910034e-05, "reward": 0.5392500162124634, "reward_std": 0.20774057507514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.47874999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.4790041744709015, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.4425306022167206, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 489.0625, "completions/min_length": 405.0, "epoch": 3.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016023168340325356, "kl": 0.005037094699218869, "learning_rate": 9.888323582724492e-07, "loss": 5.058824899606407e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 467.125, "completions/min_length": 373.0, "epoch": 3.2147058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.011634975671768188, "kl": 0.004044672939926386, "learning_rate": 9.888053703342445e-07, "loss": 4.058679041918367e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 478.25, "completions/min_length": 427.0, "epoch": 3.2161764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.5326887369155884, "kl": 0.00610992475412786, "learning_rate": 9.88778350194779e-07, "loss": 6.039440631866455e-05, "reward": 0.5375000238418579, "reward_std": 0.2781112790107727, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.3400367796421051, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 506.0, "completions/min_length": 410.0, "epoch": 3.2176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4603334665298462, "kl": 0.005075453198514879, "learning_rate": 9.887512978558328e-07, "loss": 5.08427619934082e-05, "reward": 0.67658931016922, "reward_std": 0.25137951970100403, "rewards/DrugCombAccuracyCOTORM/mean": 0.5996428728103638, "rewards/DrugCombAccuracyCOTORM/std": 0.36567243933677673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 499.4375, "completions/min_length": 453.0, "epoch": 3.2191176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.376861333847046, "kl": 0.005406105541624129, "learning_rate": 9.88724213319188e-07, "loss": 5.447492003440857e-05, "reward": 0.28993332386016846, "reward_std": 0.282772958278656, "rewards/DrugCombAccuracyCOTORM/mean": 0.25824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.44349634647369385, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1666666567325592, "rewards/DrugCombCoverageCOTORM/std": 0.8432741165161133, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 449.5, "completions/min_length": 387.0, "epoch": 3.2205882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.006267306860536337, "kl": 0.004435877490323037, "learning_rate": 9.886970965866288e-07, "loss": 4.42167220171541e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 548.125, "completions/min_length": 481.0, "epoch": 3.222058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5333538055419922, "kl": 0.007763785542920232, "learning_rate": 9.886699476599417e-07, "loss": 7.571280002593994e-05, "reward": 0.7124999761581421, "reward_std": 0.41731178760528564, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 445.0, "completions/min_length": 368.0, "epoch": 3.223529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007203398738056421, "kl": 0.0035165005829185247, "learning_rate": 9.886427665409149e-07, "loss": 3.51407979906071e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.5625, "completions/min_length": 373.0, "epoch": 3.225, "frac_reward_zero_std": 0.5, "grad_norm": 1.054118275642395, "kl": 0.005467925337143242, "learning_rate": 9.886155532313395e-07, "loss": 5.46090304851532e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 461.25, "completions/min_length": 416.0, "epoch": 3.226470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9563319683074951, "kl": 0.0054329747799783945, "learning_rate": 9.885883077330077e-07, "loss": 5.427838914329186e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 404.375, "completions/min_length": 355.0, "epoch": 3.2279411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.008439545519649982, "kl": 0.004577756451908499, "learning_rate": 9.885610300477149e-07, "loss": 4.553369581117295e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 466.625, "completions/min_length": 395.0, "epoch": 3.2294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014914215542376041, "kl": 0.004819887573830783, "learning_rate": 9.885337201772576e-07, "loss": 4.7571280447300524e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 448.1875, "completions/min_length": 397.0, "epoch": 3.2308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.2091929912567139, "kl": 0.005047998041845858, "learning_rate": 9.885063781234354e-07, "loss": 5.016792783862911e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 434.1875, "completions/min_length": 393.0, "epoch": 3.2323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068462747149169445, "kl": 0.003597201546654105, "learning_rate": 9.884790038880491e-07, "loss": 3.576388553483412e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 422.8125, "completions/min_length": 377.0, "epoch": 3.2338235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0588597059249878, "kl": 0.005142880487255752, "learning_rate": 9.884515974729023e-07, "loss": 5.1635041018016636e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 468.6875, "completions/min_length": 420.0, "epoch": 3.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0090996026992798, "kl": 0.005107111413963139, "learning_rate": 9.884241588798003e-07, "loss": 5.099177360534668e-05, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 501.0, "completions/min_length": 412.0, "epoch": 3.236764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1040292978286743, "kl": 0.005147037096321583, "learning_rate": 9.883966881105507e-07, "loss": 5.1528215408325195e-05, "reward": 0.6418541669845581, "reward_std": 0.0722048208117485, "rewards/DrugCombAccuracyCOTORM/mean": 0.5796614289283752, "rewards/DrugCombAccuracyCOTORM/std": 0.44902774691581726, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.23935680091381073, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 446.3125, "completions/min_length": 398.0, "epoch": 3.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9862183332443237, "kl": 0.005291525856591761, "learning_rate": 9.883691851669634e-07, "loss": 5.2772462368011475e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 425.5, "completions/min_length": 356.0, "epoch": 3.239705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9625582098960876, "kl": 0.005501567909959704, "learning_rate": 9.883416500508502e-07, "loss": 5.441532994154841e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 474.1875, "completions/min_length": 393.0, "epoch": 3.2411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010680443607270718, "kl": 0.004804105032235384, "learning_rate": 9.883140827640245e-07, "loss": 4.8081546992762014e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 477.75, "completions/min_length": 333.0, "epoch": 3.2426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1810157299041748, "kl": 0.0048254145658575, "learning_rate": 9.882864833083034e-07, "loss": 4.8179579607676715e-05, "reward": 0.663754940032959, "reward_std": 0.10638092458248138, "rewards/DrugCombAccuracyCOTORM/mean": 0.5906311273574829, "rewards/DrugCombAccuracyCOTORM/std": 0.4586677551269531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.12583057582378387, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 443.4375, "completions/min_length": 351.0, "epoch": 3.2441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.396237850189209, "kl": 0.004377368197310716, "learning_rate": 9.88258851685504e-07, "loss": 4.408136010169983e-05, "reward": 0.4437500238418579, "reward_std": 0.3625184893608093, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 511.5625, "completions/min_length": 445.0, "epoch": 3.2455882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4594813585281372, "kl": 0.006495249108411372, "learning_rate": 9.882311878974472e-07, "loss": 6.35385513305664e-05, "reward": 0.9151785373687744, "reward_std": 0.18964314460754395, "rewards/DrugCombAccuracyCOTORM/mean": 0.9017857313156128, "rewards/DrugCombAccuracyCOTORM/std": 0.24863573908805847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 456.9375, "completions/min_length": 365.0, "epoch": 3.2470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9211888909339905, "kl": 0.005560600897297263, "learning_rate": 9.882034919459554e-07, "loss": 5.587935447692871e-05, "reward": 0.879687488079071, "reward_std": 0.22300077974796295, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 515.3125, "completions/min_length": 434.0, "epoch": 3.248529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008464776910841465, "kl": 0.004137263807933778, "learning_rate": 9.88175763832853e-07, "loss": 4.1084880649577826e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 431.9375, "completions/min_length": 374.0, "epoch": 3.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.010539914481341839, "kl": 0.005888826213777065, "learning_rate": 9.881480035599666e-07, "loss": 5.8225363318342716e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 488.875, "completions/min_length": 431.0, "epoch": 3.251470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.029874620959162712, "kl": 0.006505894009023905, "learning_rate": 9.881202111291252e-07, "loss": 6.54185569146648e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 478.125, "completions/min_length": 411.0, "epoch": 3.2529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4864782094955444, "kl": 0.00656938599422574, "learning_rate": 9.880923865421596e-07, "loss": 6.465613842010498e-05, "reward": 0.5464166402816772, "reward_std": 0.18804451823234558, "rewards/DrugCombAccuracyCOTORM/mean": 0.45124998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 442.9375, "completions/min_length": 383.0, "epoch": 3.2544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.008518745191395283, "kl": 0.0041240136488340795, "learning_rate": 9.880645298009026e-07, "loss": 4.1206170863006264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 502.125, "completions/min_length": 408.0, "epoch": 3.2558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7526246309280396, "kl": 0.004574510618112981, "learning_rate": 9.880366409071897e-07, "loss": 4.62457537651062e-05, "reward": 0.6437000036239624, "reward_std": 0.014699229039251804, "rewards/DrugCombAccuracyCOTORM/mean": 0.5806666612625122, "rewards/DrugCombAccuracyCOTORM/std": 0.4337928295135498, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 459.6875, "completions/min_length": 408.0, "epoch": 3.2573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012415680103003979, "kl": 0.004855539300478995, "learning_rate": 9.880087198628577e-07, "loss": 4.877504761680029e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 468.625, "completions/min_length": 407.0, "epoch": 3.2588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7821053862571716, "kl": 0.00440220080781728, "learning_rate": 9.879807666697464e-07, "loss": 4.3593376176431775e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 493.1875, "completions/min_length": 421.0, "epoch": 3.260294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.2129011154174805, "kl": 0.006178511423058808, "learning_rate": 9.879527813296972e-07, "loss": 6.230175495147705e-05, "reward": 0.3384583592414856, "reward_std": 0.37138816714286804, "rewards/DrugCombAccuracyCOTORM/mean": 0.2394791692495346, "rewards/DrugCombAccuracyCOTORM/std": 0.4126391112804413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 432.25, "completions/min_length": 329.0, "epoch": 3.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009035157039761543, "kl": 0.005155974184162915, "learning_rate": 9.879247638445535e-07, "loss": 5.194294499233365e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 445.8125, "completions/min_length": 385.0, "epoch": 3.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9254565834999084, "kl": 0.005193110089749098, "learning_rate": 9.878967142161614e-07, "loss": 5.252659320831299e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.5, "completions/min_length": 392.0, "epoch": 3.264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011724001727998257, "kl": 0.004932766372803599, "learning_rate": 9.878686324463682e-07, "loss": 4.916525358567014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 541.375, "completions/min_length": 463.0, "epoch": 3.2661764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.2129015922546387, "kl": 0.0060062711127102375, "learning_rate": 9.878405185370243e-07, "loss": 6.098300218582153e-05, "reward": 0.550000011920929, "reward_std": 0.24440310895442963, "rewards/DrugCombAccuracyCOTORM/mean": 0.46875, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.547722578048706, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 487.625, "completions/min_length": 429.0, "epoch": 3.2676470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.305741310119629, "kl": 0.005092008621431887, "learning_rate": 9.878123724899813e-07, "loss": 5.067884922027588e-05, "reward": 0.5214166641235352, "reward_std": 0.2225801944732666, "rewards/DrugCombAccuracyCOTORM/mean": 0.45124998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6465721726417542, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 468.125, "completions/min_length": 357.0, "epoch": 3.2691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.009585955180227757, "kl": 0.003795821452513337, "learning_rate": 9.877841943070938e-07, "loss": 3.79966659238562e-05, "reward": 0.8583333492279053, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 440.6875, "completions/min_length": 385.0, "epoch": 3.2705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9353020191192627, "kl": 0.005120980436913669, "learning_rate": 9.877559839902183e-07, "loss": 5.1169809012208134e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 460.25, "completions/min_length": 429.0, "epoch": 3.2720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010411368682980537, "kl": 0.005178366438485682, "learning_rate": 9.877277415412128e-07, "loss": 5.152373341843486e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 409.625, "completions/min_length": 372.0, "epoch": 3.273529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015409675426781178, "kl": 0.005400120629929006, "learning_rate": 9.87699466961938e-07, "loss": 5.3307659982237965e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 468.0625, "completions/min_length": 407.0, "epoch": 3.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.011411577463150024, "kl": 0.0051122785662300885, "learning_rate": 9.876711602542563e-07, "loss": 5.1558825362008065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 484.25, "completions/min_length": 411.0, "epoch": 3.276470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1588460206985474, "kl": 0.00468511797953397, "learning_rate": 9.87642821420033e-07, "loss": 4.716110561275855e-05, "reward": 0.7509583234786987, "reward_std": 0.22535203397274017, "rewards/DrugCombAccuracyCOTORM/mean": 0.7394791841506958, "rewards/DrugCombAccuracyCOTORM/std": 0.42602020502090454, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.7122440934181213, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 473.625, "completions/min_length": 420.0, "epoch": 3.277941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.263617992401123, "kl": 0.004497582674957812, "learning_rate": 9.876144504611342e-07, "loss": 4.4994056224823e-05, "reward": 0.7256875038146973, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.6707812547683716, "rewards/DrugCombAccuracyCOTORM/std": 0.38554149866104126, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.1280868947505951, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 445.75, "completions/min_length": 377.0, "epoch": 3.2794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170746587216854, "kl": 0.006001400877721608, "learning_rate": 9.875860473794299e-07, "loss": 5.9557121858233586e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 443.0625, "completions/min_length": 377.0, "epoch": 3.2808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8858919143676758, "kl": 0.004881042754277587, "learning_rate": 9.875576121767907e-07, "loss": 4.8697227612137794e-05, "reward": 0.9197291731834412, "reward_std": 0.14868797361850739, "rewards/DrugCombAccuracyCOTORM/mean": 0.9042187333106995, "rewards/DrugCombAccuracyCOTORM/std": 0.26177236437797546, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9635416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 447.8125, "completions/min_length": 386.0, "epoch": 3.2823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010631080716848373, "kl": 0.005153308855369687, "learning_rate": 9.875291448550898e-07, "loss": 5.186433554627001e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 426.625, "completions/min_length": 362.0, "epoch": 3.2838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.018834887072443962, "kl": 0.0051784447859972715, "learning_rate": 9.875006454162025e-07, "loss": 5.166229675523937e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 440.875, "completions/min_length": 401.0, "epoch": 3.2852941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 1.0744469165802002, "kl": 0.007993252482265234, "learning_rate": 9.874721138620064e-07, "loss": 8.006119605852291e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 459.375, "completions/min_length": 388.0, "epoch": 3.286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9379593133926392, "kl": 0.004563307797070593, "learning_rate": 9.874435501943812e-07, "loss": 4.589136005961336e-05, "reward": 0.9588750004768372, "reward_std": 0.05675788223743439, "rewards/DrugCombAccuracyCOTORM/mean": 0.9524999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.10212194174528122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 436.875, "completions/min_length": 393.0, "epoch": 3.288235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8629820942878723, "kl": 0.006198094459250569, "learning_rate": 9.874149544152085e-07, "loss": 6.184672383824363e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 493.8125, "completions/min_length": 378.0, "epoch": 3.289705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1415901184082031, "kl": 0.006994615308940411, "learning_rate": 9.87386326526372e-07, "loss": 7.067620754241943e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 458.625, "completions/min_length": 380.0, "epoch": 3.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1088676452636719, "kl": 0.005009666085243225, "learning_rate": 9.873576665297575e-07, "loss": 4.976987838745117e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 437.4375, "completions/min_length": 355.0, "epoch": 3.2926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9794887900352478, "kl": 0.005466443719342351, "learning_rate": 9.873289744272535e-07, "loss": 5.4858624935150146e-05, "reward": 0.6499999761581421, "reward_std": 0.21876275539398193, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8164966106414795, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 493.875, "completions/min_length": 440.0, "epoch": 3.2941176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.0683226585388184, "kl": 0.005028056621085852, "learning_rate": 9.873002502207502e-07, "loss": 4.992166577721946e-05, "reward": 0.9154167175292969, "reward_std": 0.03417681157588959, "rewards/DrugCombAccuracyCOTORM/mean": 0.9125000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.10246950387954712, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 445.875, "completions/min_length": 397.0, "epoch": 3.2955882352941175, "frac_reward_zero_std": 1.0, "grad_norm": 0.010738982819020748, "kl": 0.004898814542684704, "learning_rate": 9.872714939121392e-07, "loss": 4.8634858103469014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 442.1875, "completions/min_length": 386.0, "epoch": 3.2970588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.041827864944934845, "kl": 0.006012925296090543, "learning_rate": 9.872427055033155e-07, "loss": 6.0175054386490956e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 428.4375, "completions/min_length": 359.0, "epoch": 3.2985294117647057, "frac_reward_zero_std": 1.0, "grad_norm": 0.07045195251703262, "kl": 0.0058656728360801935, "learning_rate": 9.872138849961753e-07, "loss": 5.788650014437735e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 453.8125, "completions/min_length": 399.0, "epoch": 3.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.008478938601911068, "kl": 0.0039028141763992608, "learning_rate": 9.871850323926177e-07, "loss": 3.882903911289759e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 517.3125, "completions/min_length": 439.0, "epoch": 3.301470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0096064805984497, "kl": 0.0043121917406097054, "learning_rate": 9.871561476945428e-07, "loss": 4.3079257011413574e-05, "reward": 0.65625, "reward_std": 0.21286731958389282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 479.9375, "completions/min_length": 394.0, "epoch": 3.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.888909101486206, "kl": 0.007050838437862694, "learning_rate": 9.871272309038535e-07, "loss": 7.048994302749634e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 434.1875, "completions/min_length": 392.0, "epoch": 3.304411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.05972683057188988, "kl": 0.005113926134072244, "learning_rate": 9.870982820224554e-07, "loss": 5.045998113928363e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.375, "completions/min_length": 368.0, "epoch": 3.3058823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 0.9398600459098816, "kl": 0.0043500344036147, "learning_rate": 9.870693010522551e-07, "loss": 4.263789742253721e-05, "reward": 0.687416672706604, "reward_std": 0.04549054056406021, "rewards/DrugCombAccuracyCOTORM/mean": 0.6274999976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.39061489701271057, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 429.5, "completions/min_length": 395.0, "epoch": 3.307352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.009435203857719898, "kl": 0.004062444786541164, "learning_rate": 9.870402879951616e-07, "loss": 4.0577593608759344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 468.9375, "completions/min_length": 421.0, "epoch": 3.3088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.1653060913085938, "kl": 0.005512479227036238, "learning_rate": 9.870112428530869e-07, "loss": 5.508575850399211e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 452.5, "completions/min_length": 407.0, "epoch": 3.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.005702013149857521, "kl": 0.003912860061973333, "learning_rate": 9.869821656279435e-07, "loss": 3.918730726581998e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 470.8125, "completions/min_length": 408.0, "epoch": 3.3117647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 0.9400215148925781, "kl": 0.004659817088395357, "learning_rate": 9.869530563216476e-07, "loss": 4.669278860092163e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 442.75, "completions/min_length": 398.0, "epoch": 3.3132352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 1.218403697013855, "kl": 0.004955271841026843, "learning_rate": 9.869239149361168e-07, "loss": 4.940051076118834e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 429.0, "completions/min_length": 367.0, "epoch": 3.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228318184614182, "kl": 0.006767075567040592, "learning_rate": 9.868947414732709e-07, "loss": 6.67741333018057e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 464.9375, "completions/min_length": 398.0, "epoch": 3.3161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.00962594710290432, "kl": 0.0044711654772982, "learning_rate": 9.868655359350313e-07, "loss": 4.478578921407461e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 404.1875, "completions/min_length": 362.0, "epoch": 3.317647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013436496257781982, "kl": 0.005026252765674144, "learning_rate": 9.868362983233224e-07, "loss": 4.965080734109506e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 434.9375, "completions/min_length": 394.0, "epoch": 3.3191176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 0.8848539590835571, "kl": 0.006300045759417117, "learning_rate": 9.8680702864007e-07, "loss": 6.258969369810075e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 437.5625, "completions/min_length": 380.0, "epoch": 3.320588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.030687961727380753, "kl": 0.004448508727364242, "learning_rate": 9.867777268872028e-07, "loss": 4.446320963324979e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 488.5625, "completions/min_length": 433.0, "epoch": 3.322058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0661696195602417, "kl": 0.004820840142201632, "learning_rate": 9.867483930666506e-07, "loss": 4.8523852456128225e-05, "reward": 0.31666669249534607, "reward_std": 0.2335033267736435, "rewards/DrugCombAccuracyCOTORM/mean": 0.2708333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.4254627227783203, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 461.125, "completions/min_length": 409.0, "epoch": 3.323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9611304402351379, "kl": 0.0038302166503854096, "learning_rate": 9.867190271803463e-07, "loss": 3.835931420326233e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 444.625, "completions/min_length": 395.0, "epoch": 3.325, "frac_reward_zero_std": 0.5, "grad_norm": 1.0248162746429443, "kl": 0.00577998033259064, "learning_rate": 9.866896292302242e-07, "loss": 5.801556835649535e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 420.25, "completions/min_length": 363.0, "epoch": 3.3264705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 1.2659138441085815, "kl": 0.005204756977036595, "learning_rate": 9.866601992182208e-07, "loss": 5.1903698476962745e-05, "reward": 0.6493333578109741, "reward_std": 0.040736082941293716, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 460.625, "completions/min_length": 403.0, "epoch": 3.3279411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 1.112765908241272, "kl": 0.005096300155855715, "learning_rate": 9.866307371462752e-07, "loss": 5.111098289489746e-05, "reward": 0.578416645526886, "reward_std": 0.17607644200325012, "rewards/DrugCombAccuracyCOTORM/mean": 0.5693749785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.5049814581871033, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9867174029350281, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 460.1875, "completions/min_length": 436.0, "epoch": 3.3294117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 1.5488289594650269, "kl": 0.008067308110184968, "learning_rate": 9.866012430163282e-07, "loss": 7.980316877365112e-05, "reward": 0.8250000476837158, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 428.875, "completions/min_length": 388.0, "epoch": 3.3308823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.1961579322814941, "kl": 0.005402757786214352, "learning_rate": 9.865717168303226e-07, "loss": 5.427951691672206e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.875, "completions/min_length": 374.0, "epoch": 3.3323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.008898855186998844, "kl": 0.004299363994505256, "learning_rate": 9.865421585902037e-07, "loss": 4.299183638067916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 451.25, "completions/min_length": 379.0, "epoch": 3.333823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.020022394135594368, "kl": 0.006191105116158724, "learning_rate": 9.865125682979187e-07, "loss": 6.149945693323389e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 486.8125, "completions/min_length": 440.0, "epoch": 3.335294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8847929239273071, "kl": 0.012973813922144473, "learning_rate": 9.86482945955417e-07, "loss": 0.00013708813639823347, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 467.375, "completions/min_length": 405.0, "epoch": 3.336764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8169549107551575, "kl": 0.006436158670112491, "learning_rate": 9.864532915646498e-07, "loss": 6.455092079704627e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 487.5, "completions/min_length": 434.0, "epoch": 3.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01953563280403614, "kl": 0.007050194661132991, "learning_rate": 9.864236051275707e-07, "loss": 7.096212357282639e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 446.1875, "completions/min_length": 400.0, "epoch": 3.3397058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.017399633303284645, "kl": 0.004905663256067783, "learning_rate": 9.863938866461358e-07, "loss": 4.9379345000488684e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 447.75, "completions/min_length": 396.0, "epoch": 3.3411764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.2148292064666748, "kl": 0.006139460834674537, "learning_rate": 9.863641361223024e-07, "loss": 6.163865327835083e-05, "reward": 0.6979166865348816, "reward_std": 0.16479605436325073, "rewards/DrugCombAccuracyCOTORM/mean": 0.6770833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39190301299095154, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 491.6875, "completions/min_length": 392.0, "epoch": 3.3426470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.6088907718658447, "kl": 0.004963678773492575, "learning_rate": 9.863343535580304e-07, "loss": 4.978477954864502e-05, "reward": 0.848312497138977, "reward_std": 0.3395412266254425, "rewards/DrugCombAccuracyCOTORM/mean": 0.8279687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.3735184073448181, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.341183602809906, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 459.1875, "completions/min_length": 402.0, "epoch": 3.3441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071015069261193275, "kl": 0.003979266155511141, "learning_rate": 9.863045389552822e-07, "loss": 4.004852962680161e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 432.375, "completions/min_length": 362.0, "epoch": 3.3455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.02948288805782795, "kl": 0.0060649525839835405, "learning_rate": 9.862746923160215e-07, "loss": 6.1017548432573676e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 434.375, "completions/min_length": 349.0, "epoch": 3.347058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0897085666656494, "kl": 0.007852739188820124, "learning_rate": 9.862448136422149e-07, "loss": 7.889017433626577e-05, "reward": 0.5625, "reward_std": 0.051754921674728394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 423.9375, "completions/min_length": 379.0, "epoch": 3.348529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.13995161652565002, "kl": 0.00831416476285085, "learning_rate": 9.862149029358302e-07, "loss": 8.495345537085086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 440.4375, "completions/min_length": 409.0, "epoch": 3.35, "frac_reward_zero_std": 1.0, "grad_norm": 0.007030718959867954, "kl": 0.004345076042227447, "learning_rate": 9.861849601988383e-07, "loss": 4.359548256616108e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 442.8125, "completions/min_length": 391.0, "epoch": 3.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012069248594343662, "kl": 0.005560969468206167, "learning_rate": 9.861549854332117e-07, "loss": 5.545093154069036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 470.375, "completions/min_length": 413.0, "epoch": 3.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.031012622639536858, "kl": 0.006716644857078791, "learning_rate": 9.861249786409248e-07, "loss": 6.837068212917075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 467.5, "completions/min_length": 423.0, "epoch": 3.3544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0801817178726196, "kl": 0.006272448576055467, "learning_rate": 9.860949398239544e-07, "loss": 6.274600309552625e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 443.0, "completions/min_length": 385.0, "epoch": 3.3558823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9272788763046265, "kl": 0.005002841469831765, "learning_rate": 9.860648689842798e-07, "loss": 4.9591064453125e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 412.0, "completions/min_length": 352.0, "epoch": 3.3573529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.008065075613558292, "kl": 0.004710317822173238, "learning_rate": 9.860347661238816e-07, "loss": 4.700130011769943e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 489.625, "completions/min_length": 429.0, "epoch": 3.3588235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.008081668056547642, "kl": 0.00507921539247036, "learning_rate": 9.86004631244743e-07, "loss": 5.10340032633394e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 485.8125, "completions/min_length": 435.0, "epoch": 3.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1804676055908203, "kl": 0.004455970891285688, "learning_rate": 9.859744643488493e-07, "loss": 4.3944455683231354e-05, "reward": 0.6421874761581421, "reward_std": 0.22350071370601654, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 476.5625, "completions/min_length": 382.0, "epoch": 3.361764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2707306146621704, "kl": 0.0070271153235808015, "learning_rate": 9.859442654381876e-07, "loss": 6.899070285726339e-05, "reward": 0.7250000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 444.875, "completions/min_length": 357.0, "epoch": 3.363235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013622713275253773, "kl": 0.0068246477749198675, "learning_rate": 9.859140345147477e-07, "loss": 6.785480945836753e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 516.1875, "completions/min_length": 441.0, "epoch": 3.364705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01056449394673109, "kl": 0.004715730436146259, "learning_rate": 9.858837715805207e-07, "loss": 4.750976950163022e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 497.5625, "completions/min_length": 448.0, "epoch": 3.3661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.660383939743042, "kl": 0.009835488570388407, "learning_rate": 9.858534766375005e-07, "loss": 9.71667468547821e-05, "reward": 0.625, "reward_std": 0.4475547671318054, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 472.875, "completions/min_length": 417.0, "epoch": 3.3676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.7634105086326599, "kl": 0.010481565666850656, "learning_rate": 9.858231496876827e-07, "loss": 0.00010902911890298128, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 486.0625, "completions/min_length": 406.0, "epoch": 3.3691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7442827820777893, "kl": 0.004489853396080434, "learning_rate": 9.857927907330653e-07, "loss": 4.427134990692139e-05, "reward": 0.8583333492279053, "reward_std": 0.12567278742790222, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.28198206424713135, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 468.5, "completions/min_length": 386.0, "epoch": 3.3705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.850273072719574, "kl": 0.004986472486052662, "learning_rate": 9.857623997756484e-07, "loss": 4.9992195272352546e-05, "reward": 0.9375, "reward_std": 0.14078858494758606, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 452.6875, "completions/min_length": 415.0, "epoch": 3.3720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9865339398384094, "kl": 0.00667951162904501, "learning_rate": 9.857319768174337e-07, "loss": 6.685760308755562e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 525.75, "completions/min_length": 424.0, "epoch": 3.373529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.905448317527771, "kl": 0.006427702493965626, "learning_rate": 9.857015218604259e-07, "loss": 6.37248158454895e-05, "reward": 0.40940120816230774, "reward_std": 0.23897263407707214, "rewards/DrugCombAccuracyCOTORM/mean": 0.2890952527523041, "rewards/DrugCombAccuracyCOTORM/std": 0.3718123435974121, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.19690898060798645, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 466.8125, "completions/min_length": 423.0, "epoch": 3.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059689427725970745, "kl": 0.003279406519141048, "learning_rate": 9.856710349066307e-07, "loss": 3.295375427114777e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 445.5625, "completions/min_length": 407.0, "epoch": 3.376470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012133529409766197, "kl": 0.004457195347640663, "learning_rate": 9.856405159580568e-07, "loss": 4.4464290112955496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 478.0625, "completions/min_length": 405.0, "epoch": 3.3779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0424256324768066, "kl": 0.005335778929293156, "learning_rate": 9.85609965016715e-07, "loss": 5.310028791427612e-05, "reward": 0.8167083263397217, "reward_std": 0.07059283554553986, "rewards/DrugCombAccuracyCOTORM/mean": 0.8008333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.2410086989402771, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 441.0, "completions/min_length": 361.0, "epoch": 3.3794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.4192253351211548, "kl": 0.008850952028296888, "learning_rate": 9.855793820846174e-07, "loss": 8.891527977539226e-05, "reward": 0.5926250219345093, "reward_std": 0.0896388590335846, "rewards/DrugCombAccuracyCOTORM/mean": 0.5415624976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4888480603694916, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.4905354380607605, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 478.875, "completions/min_length": 417.0, "epoch": 3.3808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.4984421730041504, "kl": 0.006297028390690684, "learning_rate": 9.85548767163779e-07, "loss": 6.35981559753418e-05, "reward": 0.5874999761581421, "reward_std": 0.2920154333114624, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 428.0625, "completions/min_length": 377.0, "epoch": 3.3823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008325201459228992, "kl": 0.004544136463664472, "learning_rate": 9.855181202562167e-07, "loss": 4.550071025732905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 458.875, "completions/min_length": 421.0, "epoch": 3.3838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8972020149230957, "kl": 0.004700818040873855, "learning_rate": 9.85487441363949e-07, "loss": 4.688650369644165e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 486.75, "completions/min_length": 409.0, "epoch": 3.385294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0501039028167725, "kl": 0.005229498725384474, "learning_rate": 9.854567304889975e-07, "loss": 5.3048133850097656e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 473.5625, "completions/min_length": 372.0, "epoch": 3.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 35.22617721557617, "kl": 0.2698329881532118, "learning_rate": 9.854259876333852e-07, "loss": 0.0024368390440940857, "reward": 0.7706667184829712, "reward_std": 0.18519249558448792, "rewards/DrugCombAccuracyCOTORM/mean": 0.73416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.38874441385269165, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 463.4375, "completions/min_length": 389.0, "epoch": 3.388235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014313633553683758, "kl": 0.00641566701233387, "learning_rate": 9.853952127991372e-07, "loss": 6.406210013665259e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 428.8125, "completions/min_length": 355.0, "epoch": 3.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009300137870013714, "kl": 0.004691691952757537, "learning_rate": 9.85364405988281e-07, "loss": 4.729607826448046e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 485.5, "completions/min_length": 386.0, "epoch": 3.3911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.202980399131775, "kl": 0.00631842075381428, "learning_rate": 9.85333567202846e-07, "loss": 6.321445107460022e-05, "reward": 0.5943750143051147, "reward_std": 0.06884078681468964, "rewards/DrugCombAccuracyCOTORM/mean": 0.551562488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.4697107970714569, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.6884463429450989, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 468.8125, "completions/min_length": 422.0, "epoch": 3.3926470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01143635157495737, "kl": 0.004950280650518835, "learning_rate": 9.853026964448638e-07, "loss": 4.915734098176472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 469.0, "completions/min_length": 392.0, "epoch": 3.3941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.5041965246200562, "kl": 0.004421756661031395, "learning_rate": 9.852717937163683e-07, "loss": 4.406273365020752e-05, "reward": 0.762499988079071, "reward_std": 0.42026329040527344, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 477.4375, "completions/min_length": 437.0, "epoch": 3.3955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7775794267654419, "kl": 0.003994041413534433, "learning_rate": 9.85240859019395e-07, "loss": 3.996607119916007e-05, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 451.625, "completions/min_length": 382.0, "epoch": 3.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.007698793895542622, "kl": 0.003596042573917657, "learning_rate": 9.852098923559817e-07, "loss": 3.5565019061323255e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 480.125, "completions/min_length": 418.0, "epoch": 3.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9416216611862183, "kl": 0.005405664618592709, "learning_rate": 9.85178893728169e-07, "loss": 5.4545700550079346e-05, "reward": 0.6854166984558105, "reward_std": 0.20647984743118286, "rewards/DrugCombAccuracyCOTORM/mean": 0.6770833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4323439598083496, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 478.875, "completions/min_length": 386.0, "epoch": 3.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.008958714082837105, "kl": 0.004072434385307133, "learning_rate": 9.851478631379982e-07, "loss": 4.08074411097914e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 414.875, "completions/min_length": 359.0, "epoch": 3.401470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9363846182823181, "kl": 0.003978122258558869, "learning_rate": 9.851168005875142e-07, "loss": 3.984241993748583e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 443.3125, "completions/min_length": 403.0, "epoch": 3.402941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007773107383400202, "kl": 0.004193104454316199, "learning_rate": 9.850857060787628e-07, "loss": 4.184149656794034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 486.5625, "completions/min_length": 421.0, "epoch": 3.4044117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 1.5516159534454346, "kl": 0.007881595403887331, "learning_rate": 9.850545796137929e-07, "loss": 7.802248001098633e-05, "reward": 0.606249988079071, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 459.5625, "completions/min_length": 410.0, "epoch": 3.4058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.011291260831058025, "kl": 0.005233787233009934, "learning_rate": 9.850234211946548e-07, "loss": 5.1835188060067594e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 477.6875, "completions/min_length": 378.0, "epoch": 3.4073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0622012615203857, "kl": 0.004418124328367412, "learning_rate": 9.84992230823401e-07, "loss": 4.407763481140137e-05, "reward": 0.7541667222976685, "reward_std": 0.1560296267271042, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3824869990348816, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 515.25, "completions/min_length": 392.0, "epoch": 3.4088235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9587967991828918, "kl": 0.005223532789386809, "learning_rate": 9.849610085020865e-07, "loss": 5.254767165752128e-05, "reward": 0.8452291488647461, "reward_std": 0.15892790257930756, "rewards/DrugCombAccuracyCOTORM/mean": 0.8123958110809326, "rewards/DrugCombAccuracyCOTORM/std": 0.32724323868751526, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 451.5, "completions/min_length": 372.0, "epoch": 3.4102941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.9124370813369751, "kl": 0.005740518448874354, "learning_rate": 9.84929754232768e-07, "loss": 5.766749382019043e-05, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 450.75, "completions/min_length": 374.0, "epoch": 3.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2122058868408203, "kl": 0.005921735777519643, "learning_rate": 9.848984680175048e-07, "loss": 5.951523780822754e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 369.3125, "completions/min_length": 319.0, "epoch": 3.413235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008310921490192413, "kl": 0.0038038388593122363, "learning_rate": 9.848671498583572e-07, "loss": 3.754008503165096e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/mean_length": 519.5625, "completions/min_length": 428.0, "epoch": 3.414705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9690060615539551, "kl": 0.00497090513817966, "learning_rate": 9.848357997573892e-07, "loss": 5.013660847907886e-05, "reward": 0.5636945962905884, "reward_std": 0.07143524289131165, "rewards/DrugCombAccuracyCOTORM/mean": 0.5303994417190552, "rewards/DrugCombAccuracyCOTORM/std": 0.4884902536869049, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.39375001192092896, "rewards/DrugCombCoverageCOTORM/std": 0.8504655957221985, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 436.625, "completions/min_length": 352.0, "epoch": 3.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0678898096084595, "kl": 0.006229043588973582, "learning_rate": 9.848044177166654e-07, "loss": 6.121769547462463e-05, "reward": 0.8062499761581421, "reward_std": 0.20848803222179413, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 432.125, "completions/min_length": 397.0, "epoch": 3.4176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.007380648050457239, "kl": 0.0036133293760940433, "learning_rate": 9.847730037382535e-07, "loss": 3.6168366932542995e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 490.9375, "completions/min_length": 418.0, "epoch": 3.4191176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.1398102045059204, "kl": 0.007434439845383167, "learning_rate": 9.847415578242231e-07, "loss": 7.540670776506886e-05, "reward": 0.9239000082015991, "reward_std": 0.1409098207950592, "rewards/DrugCombAccuracyCOTORM/mean": 0.9079999923706055, "rewards/DrugCombAccuracyCOTORM/std": 0.25139185786247253, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 467.625, "completions/min_length": 407.0, "epoch": 3.4205882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9536342024803162, "kl": 0.0053108567371964455, "learning_rate": 9.847100799766454e-07, "loss": 5.301833152770996e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 430.625, "completions/min_length": 387.0, "epoch": 3.4220588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.01329903956502676, "kl": 0.004730508429929614, "learning_rate": 9.846785701975944e-07, "loss": 4.736606206279248e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 458.125, "completions/min_length": 343.0, "epoch": 3.4235294117647057, "frac_reward_zero_std": 0.0, "grad_norm": 1.6235052347183228, "kl": 0.0066496930085122585, "learning_rate": 9.846470284891456e-07, "loss": 6.640329957008362e-05, "reward": 0.38199999928474426, "reward_std": 0.30222249031066895, "rewards/DrugCombAccuracyCOTORM/mean": 0.3004166781902313, "rewards/DrugCombAccuracyCOTORM/std": 0.4260444641113281, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 513.1875, "completions/min_length": 434.0, "epoch": 3.425, "frac_reward_zero_std": 0.0, "grad_norm": 1.324292778968811, "kl": 0.005038626724854112, "learning_rate": 9.846154548533772e-07, "loss": 5.003809928894043e-05, "reward": 0.5505833625793457, "reward_std": 0.1213446632027626, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.12484557926654816, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 427.75, "completions/min_length": 390.0, "epoch": 3.426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011650734581053257, "kl": 0.005301948287524283, "learning_rate": 9.84583849292369e-07, "loss": 5.308965046424419e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 497.6875, "completions/min_length": 370.0, "epoch": 3.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.363430380821228, "kl": 0.007363828714005649, "learning_rate": 9.84552211808203e-07, "loss": 7.342991011682898e-05, "reward": 0.8832916617393494, "reward_std": 0.07431169599294662, "rewards/DrugCombAccuracyCOTORM/mean": 0.8593229055404663, "rewards/DrugCombAccuracyCOTORM/std": 0.1912601888179779, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 451.9375, "completions/min_length": 366.0, "epoch": 3.429411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.06388720124959946, "kl": 0.007318688090890646, "learning_rate": 9.845205424029637e-07, "loss": 7.448336691595614e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 478.875, "completions/min_length": 441.0, "epoch": 3.4308823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.014051917009055614, "kl": 0.006145160528831184, "learning_rate": 9.844888410787374e-07, "loss": 6.0862876125611365e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.3125, "completions/min_length": 382.0, "epoch": 3.432352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.017556626349687576, "kl": 0.005818380566779524, "learning_rate": 9.84457107837612e-07, "loss": 5.841037636855617e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 416.75, "completions/min_length": 381.0, "epoch": 3.4338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.01824399083852768, "kl": 0.005105867865495384, "learning_rate": 9.844253426816784e-07, "loss": 5.0896196626126766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 469.625, "completions/min_length": 410.0, "epoch": 3.435294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9327583312988281, "kl": 0.005262524471618235, "learning_rate": 9.843935456130293e-07, "loss": 5.288973625283688e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 462.125, "completions/min_length": 395.0, "epoch": 3.4367647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.321940302848816, "kl": 0.004132242756895721, "learning_rate": 9.843617166337592e-07, "loss": 4.156678915023804e-05, "reward": 0.7671874761581421, "reward_std": 0.4330177903175354, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/mean_length": 512.5625, "completions/min_length": 420.0, "epoch": 3.4382352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 0.9371845722198486, "kl": 0.00610708340536803, "learning_rate": 9.84329855745965e-07, "loss": 6.07222318649292e-05, "reward": 0.8070625066757202, "reward_std": 0.228886678814888, "rewards/DrugCombAccuracyCOTORM/mean": 0.7881250381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.4028104543685913, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.5399545431137085, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 416.75, "completions/min_length": 373.0, "epoch": 3.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008500923402607441, "kl": 0.003954656363930553, "learning_rate": 9.842979629517456e-07, "loss": 3.9259532059077173e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 423.5625, "completions/min_length": 336.0, "epoch": 3.4411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077723730355501175, "kl": 0.003909045190084726, "learning_rate": 9.84266038253202e-07, "loss": 3.8448408304248005e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 484.1875, "completions/min_length": 428.0, "epoch": 3.442647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.470871925354004, "kl": 0.004838444641791284, "learning_rate": 9.842340816524372e-07, "loss": 4.83393669128418e-05, "reward": 0.8047499656677246, "reward_std": 0.3418703079223633, "rewards/DrugCombAccuracyCOTORM/mean": 0.784583330154419, "rewards/DrugCombAccuracyCOTORM/std": 0.373108834028244, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5087202787399292, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 456.1875, "completions/min_length": 413.0, "epoch": 3.4441176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.0181026458740234, "kl": 0.004926148918457329, "learning_rate": 9.842020931515568e-07, "loss": 4.924088716506958e-05, "reward": 0.5699374675750732, "reward_std": 0.09652291983366013, "rewards/DrugCombAccuracyCOTORM/mean": 0.5464062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.4765719473361969, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.328125, "rewards/DrugCombCoverageCOTORM/std": 0.9296897053718567, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 461.9375, "completions/min_length": 390.0, "epoch": 3.445588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7189080715179443, "kl": 0.0051288867252878845, "learning_rate": 9.841700727526677e-07, "loss": 5.0838032620958984e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 452.5, "completions/min_length": 363.0, "epoch": 3.447058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0718315839767456, "kl": 0.004750690422952175, "learning_rate": 9.841380204578794e-07, "loss": 4.756450653076172e-05, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 434.4375, "completions/min_length": 358.0, "epoch": 3.448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012726196087896824, "kl": 0.00542739755474031, "learning_rate": 9.841059362693035e-07, "loss": 5.4098643886391073e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 458.4375, "completions/min_length": 341.0, "epoch": 3.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.011839701794087887, "kl": 0.005663007439579815, "learning_rate": 9.840738201890539e-07, "loss": 5.567783955484629e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 447.3125, "completions/min_length": 373.0, "epoch": 3.4514705882352943, "frac_reward_zero_std": 0.5, "grad_norm": 1.1912832260131836, "kl": 0.00878350785933435, "learning_rate": 9.840416722192458e-07, "loss": 8.72397213242948e-05, "reward": 0.5291666984558105, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.5208333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.38429832458496094, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 433.0625, "completions/min_length": 378.0, "epoch": 3.4529411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 0.9090607762336731, "kl": 0.004133323731366545, "learning_rate": 9.840094923619975e-07, "loss": 4.135361814405769e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 467.25, "completions/min_length": 419.0, "epoch": 3.4544117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 1.1564782857894897, "kl": 0.0055190479615703225, "learning_rate": 9.839772806194286e-07, "loss": 5.5247481213882565e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 441.5, "completions/min_length": 389.0, "epoch": 3.4558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9892612099647522, "kl": 0.00479130883468315, "learning_rate": 9.839450369936612e-07, "loss": 4.8453963245265186e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 452.3125, "completions/min_length": 324.0, "epoch": 3.4573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.1003824472427368, "kl": 0.004785005701705813, "learning_rate": 9.839127614868195e-07, "loss": 4.790649109054357e-05, "reward": 0.8531249761581421, "reward_std": 0.1809881180524826, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3010398745536804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 446.5, "completions/min_length": 423.0, "epoch": 3.458823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8267003297805786, "kl": 0.004766578727867454, "learning_rate": 9.8388045410103e-07, "loss": 4.751235246658325e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 426.8125, "completions/min_length": 392.0, "epoch": 3.460294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009497123770415783, "kl": 0.0051777290645986795, "learning_rate": 9.838481148384204e-07, "loss": 5.186230919207446e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 440.3125, "completions/min_length": 373.0, "epoch": 3.461764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7187722325325012, "kl": 0.005043245619162917, "learning_rate": 9.838157437011214e-07, "loss": 5.0455331802368164e-05, "reward": 0.6626666784286499, "reward_std": 0.024513036012649536, "rewards/DrugCombAccuracyCOTORM/mean": 0.602142870426178, "rewards/DrugCombAccuracyCOTORM/std": 0.4120253622531891, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8095238208770752, "rewards/DrugCombCoverageCOTORM/std": 0.21717627346515656, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 451.75, "completions/min_length": 407.0, "epoch": 3.463235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8648449778556824, "kl": 0.004227102326694876, "learning_rate": 9.837833406912658e-07, "loss": 4.212185740470886e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 459.5625, "completions/min_length": 397.0, "epoch": 3.4647058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.9758389592170715, "kl": 0.005911703803576529, "learning_rate": 9.837509058109878e-07, "loss": 5.917996168136597e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 480.625, "completions/min_length": 419.0, "epoch": 3.4661764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.1119426488876343, "kl": 0.005549886263906956, "learning_rate": 9.837184390624247e-07, "loss": 5.6803226470947266e-05, "reward": 0.8126979470252991, "reward_std": 0.024521542713046074, "rewards/DrugCombAccuracyCOTORM/mean": 0.7785677313804626, "rewards/DrugCombAccuracyCOTORM/std": 0.23512694239616394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8984375, "rewards/DrugCombCoverageCOTORM/std": 0.16594897210597992, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 449.5625, "completions/min_length": 419.0, "epoch": 3.4676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.1203484535217285, "kl": 0.005998550157528371, "learning_rate": 9.836859404477148e-07, "loss": 6.021250374033116e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 464.5625, "completions/min_length": 381.0, "epoch": 3.4691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9616506099700928, "kl": 0.0068011703551746905, "learning_rate": 9.83653409968999e-07, "loss": 6.648661656072363e-05, "reward": 0.675000011920929, "reward_std": 0.20528724789619446, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 441.875, "completions/min_length": 376.0, "epoch": 3.4705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.010756258852779865, "kl": 0.005192446173168719, "learning_rate": 9.836208476284206e-07, "loss": 5.207550202612765e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 537.9375, "completions/min_length": 491.0, "epoch": 3.472058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0600873231887817, "kl": 0.005098112334962934, "learning_rate": 9.835882534281248e-07, "loss": 5.101541682961397e-05, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 414.0625, "completions/min_length": 314.0, "epoch": 3.473529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010124096646904945, "kl": 0.004441597673576325, "learning_rate": 9.835556273702587e-07, "loss": 4.387355875223875e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 484.5625, "completions/min_length": 394.0, "epoch": 3.475, "frac_reward_zero_std": 1.0, "grad_norm": 0.05072404071688652, "kl": 0.006315817357972264, "learning_rate": 9.835229694569715e-07, "loss": 6.378669786499813e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 475.6875, "completions/min_length": 396.0, "epoch": 3.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0357385016977787, "kl": 0.005263926228508353, "learning_rate": 9.834902796904148e-07, "loss": 5.2434981625992805e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 437.5625, "completions/min_length": 360.0, "epoch": 3.4779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.108889102935791, "kl": 0.009171761223115027, "learning_rate": 9.83457558072742e-07, "loss": 9.048497304320335e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 474.0, "completions/min_length": 408.0, "epoch": 3.4794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.3161656856536865, "kl": 0.00587460957467556, "learning_rate": 9.834248046061086e-07, "loss": 5.838274955749512e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 459.0, "completions/min_length": 344.0, "epoch": 3.4808823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.040962349623441696, "kl": 0.005876188399270177, "learning_rate": 9.833920192926725e-07, "loss": 5.913774293730967e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 418.375, "completions/min_length": 327.0, "epoch": 3.4823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.017864495515823364, "kl": 0.0047811626573093235, "learning_rate": 9.833592021345937e-07, "loss": 4.8127130867214873e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 436.25, "completions/min_length": 384.0, "epoch": 3.4838235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.1573878526687622, "kl": 0.00583265267778188, "learning_rate": 9.833263531340336e-07, "loss": 5.827664790558629e-05, "reward": 0.30000001192092896, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 419.3125, "completions/min_length": 319.0, "epoch": 3.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.022121574729681015, "kl": 0.008008434437215328, "learning_rate": 9.832934722931566e-07, "loss": 8.076047379290685e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 423.25, "completions/min_length": 378.0, "epoch": 3.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009972166270017624, "kl": 0.0035399021580815315, "learning_rate": 9.832605596141291e-07, "loss": 3.533781273290515e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 468.0, "completions/min_length": 358.0, "epoch": 3.488235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.786149263381958, "kl": 0.005443378002382815, "learning_rate": 9.832276150991184e-07, "loss": 5.474872887134552e-05, "reward": 0.33541667461395264, "reward_std": 0.30375364422798157, "rewards/DrugCombAccuracyCOTORM/mean": 0.2708333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875000149011612, "rewards/DrugCombCoverageCOTORM/std": 0.9658521413803101, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 429.6875, "completions/min_length": 395.0, "epoch": 3.489705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 11.994820594787598, "kl": 0.06931622698903084, "learning_rate": 9.831946387502956e-07, "loss": 0.0006655305624008179, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 473.0, "completions/min_length": 432.0, "epoch": 3.4911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9388191103935242, "kl": 0.004914164426736534, "learning_rate": 9.831616305698326e-07, "loss": 4.8986210458679125e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 438.75, "completions/min_length": 352.0, "epoch": 3.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009562128223478794, "kl": 0.004753996967338026, "learning_rate": 9.831285905599044e-07, "loss": 4.80273156426847e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 417.875, "completions/min_length": 360.0, "epoch": 3.4941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011415909975767136, "kl": 0.005941484589129686, "learning_rate": 9.83095518722687e-07, "loss": 5.8947684010490775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 534.25, "completions/min_length": 464.0, "epoch": 3.4955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8344290852546692, "kl": 0.004902589716948569, "learning_rate": 9.830624150603595e-07, "loss": 4.906952381134033e-05, "reward": 0.5287333130836487, "reward_std": 0.1754155158996582, "rewards/DrugCombAccuracyCOTORM/mean": 0.4348750114440918, "rewards/DrugCombAccuracyCOTORM/std": 0.37228500843048096, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8083333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.15939700603485107, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 432.6875, "completions/min_length": 392.0, "epoch": 3.4970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7237756848335266, "kl": 0.004906031768769026, "learning_rate": 9.830292795751027e-07, "loss": 4.8980116844177246e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/mean_length": 544.1875, "completions/min_length": 452.0, "epoch": 3.498529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8616049885749817, "kl": 0.007062231074087322, "learning_rate": 9.82996112269099e-07, "loss": 7.15023634256795e-05, "reward": 0.8709976673126221, "reward_std": 0.1649823635816574, "rewards/DrugCombAccuracyCOTORM/mean": 0.846038818359375, "rewards/DrugCombAccuracyCOTORM/std": 0.31054195761680603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9416666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.15515822172164917, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 452.125, "completions/min_length": 391.0, "epoch": 3.5, "frac_reward_zero_std": 0.5, "grad_norm": 1.0151447057724, "kl": 0.005209976574406028, "learning_rate": 9.82962913144534e-07, "loss": 5.195505218580365e-05, "reward": 0.5367083549499512, "reward_std": 0.18580015003681183, "rewards/DrugCombAccuracyCOTORM/mean": 0.4534375071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3811548948287964, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.19214914739131927, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 470.125, "completions/min_length": 336.0, "epoch": 3.501470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6946433186531067, "kl": 0.005752969766035676, "learning_rate": 9.829296822035946e-07, "loss": 5.796551704406738e-05, "reward": 0.8270833492279053, "reward_std": 0.1119585856795311, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 482.75, "completions/min_length": 419.0, "epoch": 3.5029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.039965808391571045, "kl": 0.0067302664974704385, "learning_rate": 9.828964194484696e-07, "loss": 6.818956171628088e-05, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 454.125, "completions/min_length": 417.0, "epoch": 3.5044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014807458966970444, "kl": 0.005836008116602898, "learning_rate": 9.82863124881351e-07, "loss": 5.80177475058008e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 484.75, "completions/min_length": 409.0, "epoch": 3.5058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9662545919418335, "kl": 0.004784385906532407, "learning_rate": 9.828297985044312e-07, "loss": 4.778849324793555e-05, "reward": 0.9114583730697632, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 438.25, "completions/min_length": 384.0, "epoch": 3.5073529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4306398630142212, "kl": 0.005960044451057911, "learning_rate": 9.827964403199065e-07, "loss": 5.9373676776885986e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 468.6875, "completions/min_length": 433.0, "epoch": 3.5088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8401203751564026, "kl": 0.00599130685441196, "learning_rate": 9.82763050329974e-07, "loss": 6.001919973641634e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 458.1875, "completions/min_length": 382.0, "epoch": 3.510294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.97096848487854, "kl": 0.005165204638615251, "learning_rate": 9.827296285368334e-07, "loss": 5.1759183406829834e-05, "reward": 0.7250000238418579, "reward_std": 0.22834810614585876, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.5, "completions/min_length": 412.0, "epoch": 3.511764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0511503219604492, "kl": 0.005313961999490857, "learning_rate": 9.826961749426865e-07, "loss": 5.3513795137405396e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 456.5, "completions/min_length": 368.0, "epoch": 3.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9507404565811157, "kl": 0.006154132774099708, "learning_rate": 9.82662689549737e-07, "loss": 6.115281576057896e-05, "reward": 0.809249997138977, "reward_std": 0.20610469579696655, "rewards/DrugCombAccuracyCOTORM/mean": 0.7654687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.42317667603492737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 469.625, "completions/min_length": 389.0, "epoch": 3.514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.084870457649231, "kl": 0.004743744328152388, "learning_rate": 9.826291723601913e-07, "loss": 4.738569259643555e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 446.5, "completions/min_length": 401.0, "epoch": 3.5161764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.012648122385144234, "kl": 0.005300243967212737, "learning_rate": 9.825956233762571e-07, "loss": 5.289862383506261e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 475.3125, "completions/min_length": 402.0, "epoch": 3.5176470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0956614017486572, "kl": 0.021801774972118437, "learning_rate": 9.825620426001443e-07, "loss": 0.00021611154079437256, "reward": 0.987333357334137, "reward_std": 0.03582672402262688, "rewards/DrugCombAccuracyCOTORM/mean": 0.98416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.06333333253860474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 528.75, "completions/min_length": 443.0, "epoch": 3.5191176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.627650260925293, "kl": 0.007027611951343715, "learning_rate": 9.825284300340657e-07, "loss": 6.961822509765625e-05, "reward": 0.6797499656677246, "reward_std": 0.23046213388442993, "rewards/DrugCombAccuracyCOTORM/mean": 0.6179167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.39864006638526917, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.291070818901062, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 431.0, "completions/min_length": 379.0, "epoch": 3.5205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.008255855180323124, "kl": 0.0040719390963204205, "learning_rate": 9.82494785680235e-07, "loss": 4.096242628293112e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 505.875, "completions/min_length": 465.0, "epoch": 3.5220588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.4215753078460693, "kl": 0.005344502336811274, "learning_rate": 9.824611095408688e-07, "loss": 5.348026752471924e-05, "reward": 0.831250011920929, "reward_std": 0.3589020371437073, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 452.9375, "completions/min_length": 392.0, "epoch": 3.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0505067110061646, "kl": 0.005540514597669244, "learning_rate": 9.82427401618186e-07, "loss": 5.579279604717158e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 467.3125, "completions/min_length": 402.0, "epoch": 3.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.025184396654367447, "kl": 0.005387527751736343, "learning_rate": 9.823936619144065e-07, "loss": 5.446594877867028e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 441.625, "completions/min_length": 383.0, "epoch": 3.526470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017506062984466553, "kl": 0.006207255995832384, "learning_rate": 9.823598904317533e-07, "loss": 6.198106711963192e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 474.0, "completions/min_length": 389.0, "epoch": 3.527941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03213983029127121, "kl": 0.00485173350898549, "learning_rate": 9.823260871724514e-07, "loss": 4.810336395166814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 490.0625, "completions/min_length": 425.0, "epoch": 3.5294117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 1.3038257360458374, "kl": 0.009858847130089998, "learning_rate": 9.822922521387276e-07, "loss": 9.898096323013306e-05, "reward": 0.5249999761581421, "reward_std": 0.3300992250442505, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.8563488721847534, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 480.5, "completions/min_length": 404.0, "epoch": 3.5308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.011771195568144321, "kl": 0.004789155093021691, "learning_rate": 9.822583853328105e-07, "loss": 4.8035260988399386e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 465.875, "completions/min_length": 426.0, "epoch": 3.5323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008847861550748348, "kl": 0.004752679145894945, "learning_rate": 9.822244867569315e-07, "loss": 4.762814933201298e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 459.125, "completions/min_length": 397.0, "epoch": 3.5338235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3794004917144775, "kl": 0.00477040302939713, "learning_rate": 9.821905564133236e-07, "loss": 4.7713518142700195e-05, "reward": 0.5199999809265137, "reward_std": 0.43178725242614746, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.699999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.6693280339241028, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 466.8125, "completions/min_length": 403.0, "epoch": 3.5352941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 1.0106250047683716, "kl": 0.004387503664474934, "learning_rate": 9.821565943042225e-07, "loss": 4.3987522076349705e-05, "reward": 0.9335833787918091, "reward_std": 0.11543041467666626, "rewards/DrugCombAccuracyCOTORM/mean": 0.9195833206176758, "rewards/DrugCombAccuracyCOTORM/std": 0.20547279715538025, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 493.8125, "completions/min_length": 405.0, "epoch": 3.536764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6739140748977661, "kl": 0.00874116609338671, "learning_rate": 9.821226004318647e-07, "loss": 8.592754602432251e-05, "reward": 0.5750000476837158, "reward_std": 0.45434409379959106, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 413.25, "completions/min_length": 357.0, "epoch": 3.538235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012727422639727592, "kl": 0.0047954669571481645, "learning_rate": 9.820885747984905e-07, "loss": 4.817801163881086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 415.1875, "completions/min_length": 331.0, "epoch": 3.539705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0607198476791382, "kl": 0.004539391724392772, "learning_rate": 9.820545174063405e-07, "loss": 4.558265209197998e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 437.1875, "completions/min_length": 373.0, "epoch": 3.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9422392845153809, "kl": 0.005986552452668548, "learning_rate": 9.820204282576592e-07, "loss": 6.1026232287986204e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 463.9375, "completions/min_length": 385.0, "epoch": 3.5426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01587674394249916, "kl": 0.0044699934660457075, "learning_rate": 9.819863073546917e-07, "loss": 4.48792998213321e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 462.5625, "completions/min_length": 381.0, "epoch": 3.5441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.02385644242167473, "kl": 0.005947955884039402, "learning_rate": 9.819521546996862e-07, "loss": 6.052126991562545e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 477.9375, "completions/min_length": 413.0, "epoch": 3.5455882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9775594472885132, "kl": 0.005784139269962907, "learning_rate": 9.819179702948922e-07, "loss": 5.832358874613419e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 522.6875, "completions/min_length": 477.0, "epoch": 3.5470588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 0.8529601097106934, "kl": 0.005071106541436166, "learning_rate": 9.818837541425622e-07, "loss": 5.0790498789865524e-05, "reward": 0.8677083253860474, "reward_std": 0.1752796769142151, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.291070818901062, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 452.9375, "completions/min_length": 384.0, "epoch": 3.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011970640160143375, "kl": 0.004752662964165211, "learning_rate": 9.8184950624495e-07, "loss": 4.758678551297635e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.25, "completions/min_length": 406.0, "epoch": 3.55, "frac_reward_zero_std": 0.5, "grad_norm": 1.0260109901428223, "kl": 0.005128863384015858, "learning_rate": 9.818152266043115e-07, "loss": 5.1330775022506714e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 429.1875, "completions/min_length": 380.0, "epoch": 3.5514705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.03090272657573223, "kl": 0.008075354970060289, "learning_rate": 9.817809152229054e-07, "loss": 8.054101635934785e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 378.4375, "completions/min_length": 293.0, "epoch": 3.552941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010765238665044308, "kl": 0.005562039208598435, "learning_rate": 9.817465721029916e-07, "loss": 5.513513315236196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 498.5, "completions/min_length": 405.0, "epoch": 3.5544117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 1.4372285604476929, "kl": 0.005058903363533318, "learning_rate": 9.817121972468328e-07, "loss": 5.014985799789429e-05, "reward": 0.6875, "reward_std": 0.42211851477622986, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 536.5, "completions/min_length": 463.0, "epoch": 3.5558823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 1.0330147743225098, "kl": 0.005334388697519898, "learning_rate": 9.816777906566936e-07, "loss": 5.28824093635194e-05, "reward": 0.680622935295105, "reward_std": 0.018002191558480263, "rewards/DrugCombAccuracyCOTORM/mean": 0.6184218525886536, "rewards/DrugCombAccuracyCOTORM/std": 0.3947225511074066, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8588541746139526, "rewards/DrugCombCoverageCOTORM/std": 0.16067378222942352, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 482.0625, "completions/min_length": 420.0, "epoch": 3.557352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.100029706954956, "kl": 0.008369144634343684, "learning_rate": 9.816433523348406e-07, "loss": 8.397921919822693e-05, "reward": 0.8108124732971191, "reward_std": 0.20422221720218658, "rewards/DrugCombAccuracyCOTORM/mean": 0.7654687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.42317667603492737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 436.375, "completions/min_length": 383.0, "epoch": 3.5588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.007755919825285673, "kl": 0.003856246708892286, "learning_rate": 9.816088822835422e-07, "loss": 3.880980148096569e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 423.5, "completions/min_length": 348.0, "epoch": 3.560294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.017077522352337837, "kl": 0.0051476622465997934, "learning_rate": 9.815743805050695e-07, "loss": 5.015583155909553e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 468.4375, "completions/min_length": 372.0, "epoch": 3.5617647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.5541892051696777, "kl": 0.006462085992097855, "learning_rate": 9.815398470016956e-07, "loss": 6.470084190368652e-05, "reward": 0.5874999761581421, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 413.5625, "completions/min_length": 342.0, "epoch": 3.5632352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.012664086185395718, "kl": 0.005787419097032398, "learning_rate": 9.81505281775695e-07, "loss": 5.8034049288835377e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 480.875, "completions/min_length": 432.0, "epoch": 3.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.129441499710083, "kl": 0.010474179172888398, "learning_rate": 9.81470684829345e-07, "loss": 0.0001036226749420166, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 435.8125, "completions/min_length": 381.0, "epoch": 3.5661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.0577056407928467, "kl": 0.00606298161437735, "learning_rate": 9.814360561649247e-07, "loss": 6.023743117111735e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 499.375, "completions/min_length": 438.0, "epoch": 3.567647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403548836708069, "kl": 0.006100208382122219, "learning_rate": 9.814013957847154e-07, "loss": 6.110966205596924e-05, "reward": 0.9725833535194397, "reward_std": 0.05076579004526138, "rewards/DrugCombAccuracyCOTORM/mean": 0.96833336353302, "rewards/DrugCombAccuracyCOTORM/std": 0.08652980625629425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 457.0, "completions/min_length": 403.0, "epoch": 3.5691176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.01612735353410244, "kl": 0.004514742875471711, "learning_rate": 9.813667036910004e-07, "loss": 4.4896140025230125e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 447.625, "completions/min_length": 395.0, "epoch": 3.5705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.013117448426783085, "kl": 0.00542585679795593, "learning_rate": 9.813319798860652e-07, "loss": 5.443694317364134e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 408.1875, "completions/min_length": 337.0, "epoch": 3.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.006379533559083939, "kl": 0.004203279968351126, "learning_rate": 9.812972243721972e-07, "loss": 4.168512168689631e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 515.25, "completions/min_length": 411.0, "epoch": 3.5735294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.0888776779174805, "kl": 0.00599419919308275, "learning_rate": 9.81262437151686e-07, "loss": 6.00665807723999e-05, "reward": 0.5864583253860474, "reward_std": 0.03505593538284302, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.49895724654197693, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 408.375, "completions/min_length": 353.0, "epoch": 3.575, "frac_reward_zero_std": 0.0, "grad_norm": 1.315551519393921, "kl": 0.006168952560983598, "learning_rate": 9.812276182268236e-07, "loss": 6.155669689178467e-05, "reward": 0.762499988079071, "reward_std": 0.4397645592689514, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 480.0625, "completions/min_length": 420.0, "epoch": 3.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9751654863357544, "kl": 0.004497423185966909, "learning_rate": 9.811927675999034e-07, "loss": 4.51207160949707e-05, "reward": 0.906833291053772, "reward_std": 0.17410698533058167, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 486.875, "completions/min_length": 400.0, "epoch": 3.5779411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 0.9369790554046631, "kl": 0.0050355648854747415, "learning_rate": 9.811578852732213e-07, "loss": 5.0110706069972366e-05, "reward": 0.16250000894069672, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.375, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 443.125, "completions/min_length": 403.0, "epoch": 3.5794117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.014562280848622322, "kl": 0.0052365666488185525, "learning_rate": 9.811229712490756e-07, "loss": 5.204425542615354e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 507.3125, "completions/min_length": 435.0, "epoch": 3.5808823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.100225567817688, "kl": 0.006809527054429054, "learning_rate": 9.81088025529766e-07, "loss": 6.805360317230225e-05, "reward": 0.9056999683380127, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8914999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.2964784502983093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 460.8125, "completions/min_length": 396.0, "epoch": 3.5823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.011235085316002369, "kl": 0.004927125235553831, "learning_rate": 9.81053048117595e-07, "loss": 4.904436718788929e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 437.25, "completions/min_length": 397.0, "epoch": 3.583823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.028253596276044846, "kl": 0.006936219986528158, "learning_rate": 9.810180390148664e-07, "loss": 7.00631644576788e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.75, "completions/min_length": 328.0, "epoch": 3.585294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9548561573028564, "kl": 0.0046408476191572845, "learning_rate": 9.809829982238866e-07, "loss": 4.665553569793701e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 487.4375, "completions/min_length": 435.0, "epoch": 3.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.4118399620056152, "kl": 0.008497418253682554, "learning_rate": 9.809479257469642e-07, "loss": 8.490326581522822e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 455.875, "completions/min_length": 409.0, "epoch": 3.588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1063288450241089, "kl": 0.006944374064914882, "learning_rate": 9.809128215864096e-07, "loss": 6.970018148422241e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 424.875, "completions/min_length": 328.0, "epoch": 3.5897058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.013617017306387424, "kl": 0.005309756146743894, "learning_rate": 9.80877685744535e-07, "loss": 5.3372419642983004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/mean_length": 573.0, "completions/min_length": 396.0, "epoch": 3.5911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.4478904008865356, "kl": 0.0052803485887125134, "learning_rate": 9.80842518223656e-07, "loss": 5.323418736224994e-05, "reward": 0.5765625238418579, "reward_std": 0.015843382105231285, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.2747754454612732, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 463.4375, "completions/min_length": 406.0, "epoch": 3.5926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.023235835134983063, "kl": 0.005409683391917497, "learning_rate": 9.808073190260884e-07, "loss": 5.37217638338916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 476.0, "completions/min_length": 429.0, "epoch": 3.5941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8444397449493408, "kl": 0.006073761032894254, "learning_rate": 9.807720881541515e-07, "loss": 6.057322025299072e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 474.9375, "completions/min_length": 430.0, "epoch": 3.5955882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 1.406766653060913, "kl": 0.006923072156496346, "learning_rate": 9.80736825610166e-07, "loss": 6.873160600662231e-05, "reward": 0.75, "reward_std": 0.39218366146087646, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 464.6875, "completions/min_length": 436.0, "epoch": 3.597058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013088063336908817, "kl": 0.005922868498601019, "learning_rate": 9.807015313964552e-07, "loss": 5.934040018473752e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 476.1875, "completions/min_length": 427.0, "epoch": 3.598529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2340692281723022, "kl": 0.005701541900634766, "learning_rate": 9.806662055153438e-07, "loss": 5.6585296988487244e-05, "reward": 0.8384663462638855, "reward_std": 0.025185899809002876, "rewards/DrugCombAccuracyCOTORM/mean": 0.8148797750473022, "rewards/DrugCombAccuracyCOTORM/std": 0.19693107903003693, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8656250238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1468772292137146, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 483.375, "completions/min_length": 435.0, "epoch": 3.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.0495599508285522, "kl": 0.006032144068740308, "learning_rate": 9.806308479691594e-07, "loss": 6.0178685089340433e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 447.5625, "completions/min_length": 419.0, "epoch": 3.601470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.007825271226465702, "kl": 0.003842582751531154, "learning_rate": 9.80595458760231e-07, "loss": 3.869567080982961e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 458.1875, "completions/min_length": 403.0, "epoch": 3.6029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8381192088127136, "kl": 0.005030465195886791, "learning_rate": 9.8056003789089e-07, "loss": 5.014895577915013e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 559.0625, "completions/min_length": 406.0, "epoch": 3.6044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0773532390594482, "kl": 0.005305589642375708, "learning_rate": 9.805245853634698e-07, "loss": 5.62816858291626e-05, "reward": 0.7906249761581421, "reward_std": 0.22517356276512146, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.25, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 424.375, "completions/min_length": 381.0, "epoch": 3.6058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015275112353265285, "kl": 0.00540760881267488, "learning_rate": 9.804891011803061e-07, "loss": 5.3555952035821974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 515.3125, "completions/min_length": 409.0, "epoch": 3.6073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9330654740333557, "kl": 0.004458360257558525, "learning_rate": 9.804535853437362e-07, "loss": 4.4270433136262e-05, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 448.5, "completions/min_length": 397.0, "epoch": 3.6088235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.020089365541934967, "kl": 0.0058313284534960985, "learning_rate": 9.804180378561e-07, "loss": 5.8853380323853344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 530.5625, "completions/min_length": 489.0, "epoch": 3.610294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7646157145500183, "kl": 0.005680157337337732, "learning_rate": 9.803824587197394e-07, "loss": 5.6862831115722656e-05, "reward": 0.637499988079071, "reward_std": 0.1482035368680954, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 408.1875, "completions/min_length": 373.0, "epoch": 3.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008168919011950493, "kl": 0.004805475473403931, "learning_rate": 9.80346847936998e-07, "loss": 4.809032179764472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 456.5625, "completions/min_length": 320.0, "epoch": 3.613235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0375986099243164, "kl": 0.0060472863260656595, "learning_rate": 9.80311205510222e-07, "loss": 6.092163675930351e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 433.3125, "completions/min_length": 402.0, "epoch": 3.614705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012778276577591896, "kl": 0.0038552034529857337, "learning_rate": 9.80275531441759e-07, "loss": 3.875280526699498e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 432.6875, "completions/min_length": 398.0, "epoch": 3.6161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.021774137392640114, "kl": 0.0057247335789725184, "learning_rate": 9.802398257339597e-07, "loss": 5.743365181842819e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 407.375, "completions/min_length": 341.0, "epoch": 3.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.1427256017923355, "kl": 0.009088801802136004, "learning_rate": 9.80204088389176e-07, "loss": 8.940778207033873e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 458.125, "completions/min_length": 383.0, "epoch": 3.6191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009781789034605026, "kl": 0.005593826994299889, "learning_rate": 9.80168319409762e-07, "loss": 5.6499884522054344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 521.6875, "completions/min_length": 476.0, "epoch": 3.6205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8327195048332214, "kl": 0.004785986966453493, "learning_rate": 9.801325187980744e-07, "loss": 4.757195711135864e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 410.8125, "completions/min_length": 362.0, "epoch": 3.6220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.3586862087249756, "kl": 0.010156157659366727, "learning_rate": 9.800966865564716e-07, "loss": 9.918252908391878e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/mean_length": 485.4375, "completions/min_length": 384.0, "epoch": 3.623529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.871932864189148, "kl": 0.004393414536025375, "learning_rate": 9.80060822687314e-07, "loss": 4.3759795516962186e-05, "reward": 0.65625, "reward_std": 0.21286731958389282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 421.625, "completions/min_length": 388.0, "epoch": 3.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.009683740325272083, "kl": 0.005291153909638524, "learning_rate": 9.800249271929643e-07, "loss": 5.299217446008697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.625, "completions/min_length": 362.0, "epoch": 3.626470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01587100885808468, "kl": 0.005338166258297861, "learning_rate": 9.79989000075787e-07, "loss": 5.3538511565420777e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 443.875, "completions/min_length": 369.0, "epoch": 3.6279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9378043413162231, "kl": 0.005298180738463998, "learning_rate": 9.799530413381495e-07, "loss": 5.3510069847106934e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 453.75, "completions/min_length": 359.0, "epoch": 3.6294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.2193470001220703, "kl": 0.006026301882229745, "learning_rate": 9.799170509824199e-07, "loss": 5.992746082483791e-05, "reward": 0.778083324432373, "reward_std": 0.1540384292602539, "rewards/DrugCombAccuracyCOTORM/mean": 0.7343229055404663, "rewards/DrugCombAccuracyCOTORM/std": 0.3792010545730591, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 485.875, "completions/min_length": 418.0, "epoch": 3.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9072219133377075, "kl": 0.005548821529373527, "learning_rate": 9.798810290109695e-07, "loss": 5.550346395466477e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 443.0625, "completions/min_length": 397.0, "epoch": 3.6323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8004194498062134, "kl": 0.0060786542017012835, "learning_rate": 9.798449754261714e-07, "loss": 6.137885065982118e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 492.3125, "completions/min_length": 450.0, "epoch": 3.6338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018501048907637596, "kl": 0.005566361942328513, "learning_rate": 9.798088902304008e-07, "loss": 5.583099846262485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 462.8125, "completions/min_length": 372.0, "epoch": 3.635294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008970255963504314, "kl": 0.004403113096486777, "learning_rate": 9.797727734260346e-07, "loss": 4.36284244642593e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 454.75, "completions/min_length": 402.0, "epoch": 3.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02015446126461029, "kl": 0.007247063564136624, "learning_rate": 9.797366250154523e-07, "loss": 7.268888293765485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 523.625, "completions/min_length": 444.0, "epoch": 3.638235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1066449880599976, "kl": 0.005679239751771092, "learning_rate": 9.79700445001035e-07, "loss": 5.6806104112183675e-05, "reward": 0.8544270992279053, "reward_std": 0.1273023933172226, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.28198206424713135, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.059839196503162384, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 427.0, "completions/min_length": 384.0, "epoch": 3.639705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.030101682990789413, "kl": 0.005475857411511242, "learning_rate": 9.796642333851667e-07, "loss": 5.4775642638560385e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 501.8125, "completions/min_length": 439.0, "epoch": 3.6411764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.8299795985221863, "kl": 0.004283186397515237, "learning_rate": 9.796279901702325e-07, "loss": 4.318356513977051e-05, "reward": 0.7983333468437195, "reward_std": 0.0784134492278099, "rewards/DrugCombAccuracyCOTORM/mean": 0.7739583253860474, "rewards/DrugCombAccuracyCOTORM/std": 0.2716539800167084, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.23174059391021729, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 450.0625, "completions/min_length": 403.0, "epoch": 3.6426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.013340037316083908, "kl": 0.005511232768185437, "learning_rate": 9.7959171535862e-07, "loss": 5.515060911420733e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 491.5, "completions/min_length": 436.0, "epoch": 3.6441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8827330470085144, "kl": 0.005096095963381231, "learning_rate": 9.795554089527189e-07, "loss": 5.1015558710787445e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 422.125, "completions/min_length": 366.0, "epoch": 3.6455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.014232530258595943, "kl": 0.005734928650781512, "learning_rate": 9.795190709549213e-07, "loss": 5.7059725804720074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 459.25, "completions/min_length": 405.0, "epoch": 3.6470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.023989608511328697, "kl": 0.005027649342082441, "learning_rate": 9.794827013676205e-07, "loss": 5.05999632878229e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 410.625, "completions/min_length": 344.0, "epoch": 3.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00982921477407217, "kl": 0.0048920317203737795, "learning_rate": 9.79446300193213e-07, "loss": 4.861329944105819e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 438.3125, "completions/min_length": 399.0, "epoch": 3.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.05399973317980766, "kl": 0.007064453209750354, "learning_rate": 9.794098674340966e-07, "loss": 7.02100369380787e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 517.9375, "completions/min_length": 465.0, "epoch": 3.651470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.052369236946106, "kl": 0.005479976010974497, "learning_rate": 9.793734030926712e-07, "loss": 5.511488416232169e-05, "reward": 0.707687497138977, "reward_std": 0.18323364853858948, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 403.8125, "completions/min_length": 362.0, "epoch": 3.652941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01655537821352482, "kl": 0.00528562004910782, "learning_rate": 9.793369071713391e-07, "loss": 5.247683293418959e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 425.375, "completions/min_length": 386.0, "epoch": 3.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01113172434270382, "kl": 0.00510839163325727, "learning_rate": 9.793003796725049e-07, "loss": 5.0886352255474776e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 430.3125, "completions/min_length": 391.0, "epoch": 3.6558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.011215679347515106, "kl": 0.005682233138941228, "learning_rate": 9.792638205985744e-07, "loss": 5.675202191923745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 426.125, "completions/min_length": 354.0, "epoch": 3.6573529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.022374024614691734, "kl": 0.006390394177287817, "learning_rate": 9.792272299519563e-07, "loss": 6.355953519232571e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 391.0625, "completions/min_length": 305.0, "epoch": 3.6588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.009840973652899265, "kl": 0.00504322734195739, "learning_rate": 9.791906077350611e-07, "loss": 5.0725298933684826e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 431.1875, "completions/min_length": 369.0, "epoch": 3.6602941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.9501087069511414, "kl": 0.005486131296493113, "learning_rate": 9.791539539503013e-07, "loss": 5.5029988288879395e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 434.5625, "completions/min_length": 389.0, "epoch": 3.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1368756294250488, "kl": 0.006793597829528153, "learning_rate": 9.791172686000918e-07, "loss": 6.774549547117203e-05, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 399.6875, "completions/min_length": 351.0, "epoch": 3.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02912505343556404, "kl": 0.007296328549273312, "learning_rate": 9.790805516868488e-07, "loss": 7.31258187443018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 474.5, "completions/min_length": 397.0, "epoch": 3.664705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.06080303713679314, "kl": 0.007921882614027709, "learning_rate": 9.790438032129918e-07, "loss": 7.946604455355555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 445.5625, "completions/min_length": 397.0, "epoch": 3.666176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.249480962753296, "kl": 0.005853386712260544, "learning_rate": 9.79007023180941e-07, "loss": 5.847960710525513e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 422.75, "completions/min_length": 387.0, "epoch": 3.6676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.2276920080184937, "kl": 0.006112587871029973, "learning_rate": 9.7897021159312e-07, "loss": 6.098300218582153e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 432.125, "completions/min_length": 400.0, "epoch": 3.6691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.012478139251470566, "kl": 0.004866997478529811, "learning_rate": 9.789333684519535e-07, "loss": 4.905187961412594e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 562.6875, "completions/min_length": 471.0, "epoch": 3.6705882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 1.2478306293487549, "kl": 0.00542878988198936, "learning_rate": 9.788964937598688e-07, "loss": 5.4251402616500854e-05, "reward": 0.6776666641235352, "reward_std": 0.3616371750831604, "rewards/DrugCombAccuracyCOTORM/mean": 0.6231250166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4182617664337158, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5288001894950867, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 493.75, "completions/min_length": 446.0, "epoch": 3.6720588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 0.800150454044342, "kl": 0.004662852385081351, "learning_rate": 9.788595875192948e-07, "loss": 4.672650175052695e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 444.625, "completions/min_length": 348.0, "epoch": 3.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9735099673271179, "kl": 0.004630238865502179, "learning_rate": 9.788226497326633e-07, "loss": 4.523410461843014e-05, "reward": 0.7610833644866943, "reward_std": 0.1480911374092102, "rewards/DrugCombAccuracyCOTORM/mean": 0.7143750190734863, "rewards/DrugCombAccuracyCOTORM/std": 0.3812073767185211, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.14801150560379028, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 475.625, "completions/min_length": 413.0, "epoch": 3.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.007722535636276007, "kl": 0.004135343013331294, "learning_rate": 9.787856804024071e-07, "loss": 4.1247811168432236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 463.0, "completions/min_length": 402.0, "epoch": 3.6764705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 1.1765830516815186, "kl": 0.005996696068905294, "learning_rate": 9.78748679530962e-07, "loss": 6.009286880725995e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 481.125, "completions/min_length": 400.0, "epoch": 3.677941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4402196407318115, "kl": 0.01003135705832392, "learning_rate": 9.787116471207658e-07, "loss": 9.909272193908691e-05, "reward": 0.8666666746139526, "reward_std": 0.24688534438610077, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3442651927471161, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 513.25, "completions/min_length": 450.0, "epoch": 3.6794117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 1.2341315746307373, "kl": 0.004530992533545941, "learning_rate": 9.786745831742574e-07, "loss": 4.4852495193481445e-05, "reward": 0.8426250219345093, "reward_std": 0.3479189872741699, "rewards/DrugCombAccuracyCOTORM/mean": 0.8228124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.38252657651901245, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.3520771861076355, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 444.0, "completions/min_length": 376.0, "epoch": 3.6808823529411763, "frac_reward_zero_std": 0.5, "grad_norm": 1.1657518148422241, "kl": 0.005457671475596726, "learning_rate": 9.78637487693879e-07, "loss": 5.455315113067627e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 463.375, "completions/min_length": 427.0, "epoch": 3.682352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.009731143712997437, "kl": 0.00503399851731956, "learning_rate": 9.78600360682074e-07, "loss": 5.03224873682484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 477.6875, "completions/min_length": 386.0, "epoch": 3.6838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.347225546836853, "kl": 0.006324600777588785, "learning_rate": 9.785632021412884e-07, "loss": 6.272643804550171e-05, "reward": 0.5854166746139526, "reward_std": 0.03500283509492874, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 467.9375, "completions/min_length": 387.0, "epoch": 3.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.015377646312117577, "kl": 0.006110462360084057, "learning_rate": 9.785260120739705e-07, "loss": 6.14070740994066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 528.9375, "completions/min_length": 437.0, "epoch": 3.6867647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 1.3620737791061401, "kl": 0.006699709687381983, "learning_rate": 9.784887904825695e-07, "loss": 6.698817014694214e-05, "reward": 0.4046875238418579, "reward_std": 0.37359923124313354, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 475.5625, "completions/min_length": 412.0, "epoch": 3.6882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 1.811492681503296, "kl": 0.005346034653484821, "learning_rate": 9.784515373695381e-07, "loss": 5.3532421588897705e-05, "reward": 0.7552083730697632, "reward_std": 0.2965143918991089, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.44604745507240295, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 442.875, "completions/min_length": 399.0, "epoch": 3.689705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3377405405044556, "kl": 0.005425882991403341, "learning_rate": 9.784142527373302e-07, "loss": 5.402415990829468e-05, "reward": 0.6071250438690186, "reward_std": 0.40932103991508484, "rewards/DrugCombAccuracyCOTORM/mean": 0.5206249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.49783825874328613, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 463.8125, "completions/min_length": 387.0, "epoch": 3.6911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 2.3115923404693604, "kl": 0.007631961954757571, "learning_rate": 9.783769365884022e-07, "loss": 7.687532342970371e-05, "reward": 0.5756042003631592, "reward_std": 0.03199255093932152, "rewards/DrugCombAccuracyCOTORM/mean": 0.5183333158493042, "rewards/DrugCombAccuracyCOTORM/std": 0.49962061643600464, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.609375, "rewards/DrugCombCoverageCOTORM/std": 0.42461174726486206, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 490.0625, "completions/min_length": 403.0, "epoch": 3.692647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9078034162521362, "kl": 0.005242318962700665, "learning_rate": 9.78339588925212e-07, "loss": 5.2467217756202444e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 414.75, "completions/min_length": 346.0, "epoch": 3.6941176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.0079174041748047, "kl": 0.005522532970644534, "learning_rate": 9.783022097502203e-07, "loss": 5.537968172575347e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 488.6875, "completions/min_length": 444.0, "epoch": 3.6955882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 1.0477107763290405, "kl": 0.006513614091090858, "learning_rate": 9.782647990658893e-07, "loss": 6.509569357149303e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 545.3125, "completions/min_length": 433.0, "epoch": 3.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.08805251121521, "kl": 0.005095236352644861, "learning_rate": 9.78227356874684e-07, "loss": 5.091050115879625e-05, "reward": 0.7477869987487793, "reward_std": 0.023331418633461, "rewards/DrugCombAccuracyCOTORM/mean": 0.6916781663894653, "rewards/DrugCombAccuracyCOTORM/std": 0.32190951704978943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9444444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.09938079863786697, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 523.4375, "completions/min_length": 428.0, "epoch": 3.6985294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078197717666626, "kl": 0.005305762693751603, "learning_rate": 9.781898831790704e-07, "loss": 5.3070587455295026e-05, "reward": 0.7583541870117188, "reward_std": 0.13371886312961578, "rewards/DrugCombAccuracyCOTORM/mean": 0.7077083587646484, "rewards/DrugCombAccuracyCOTORM/std": 0.37233129143714905, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 411.4375, "completions/min_length": 373.0, "epoch": 3.7, "frac_reward_zero_std": 0.5, "grad_norm": 1.2195537090301514, "kl": 0.005517148529179394, "learning_rate": 9.781523779815178e-07, "loss": 5.529820919036865e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 456.0, "completions/min_length": 369.0, "epoch": 3.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011101345531642437, "kl": 0.005570668610744178, "learning_rate": 9.78114841284496e-07, "loss": 5.550416244659573e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 446.75, "completions/min_length": 377.0, "epoch": 3.7029411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 1.0141719579696655, "kl": 0.0071604110999032855, "learning_rate": 9.78077273090479e-07, "loss": 7.223451393656433e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 469.125, "completions/min_length": 421.0, "epoch": 3.7044117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.015014506876468658, "kl": 0.005447830480989069, "learning_rate": 9.780396734019409e-07, "loss": 5.419059016276151e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 497.1875, "completions/min_length": 448.0, "epoch": 3.7058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9055768251419067, "kl": 0.004482300370000303, "learning_rate": 9.78002042221359e-07, "loss": 4.470421117730439e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 430.0, "completions/min_length": 386.0, "epoch": 3.7073529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.012346244417130947, "kl": 0.005162718240171671, "learning_rate": 9.779643795512118e-07, "loss": 5.10914069309365e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 423.125, "completions/min_length": 335.0, "epoch": 3.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8655240535736084, "kl": 0.004557823878712952, "learning_rate": 9.779266853939812e-07, "loss": 4.546375566860661e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 409.8125, "completions/min_length": 348.0, "epoch": 3.710294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.5246397256851196, "kl": 0.00553719192976132, "learning_rate": 9.7788895975215e-07, "loss": 5.50001859664917e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 401.375, "completions/min_length": 305.0, "epoch": 3.711764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8897843956947327, "kl": 0.005193359334953129, "learning_rate": 9.778512026282035e-07, "loss": 5.1647424697875977e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 478.125, "completions/min_length": 430.0, "epoch": 3.713235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.481317400932312, "kl": 0.005620136158540845, "learning_rate": 9.778134140246291e-07, "loss": 5.601346492767334e-05, "reward": 0.550000011920929, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 501.4375, "completions/min_length": 394.0, "epoch": 3.7147058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.0277760028839111, "kl": 0.008355220663361251, "learning_rate": 9.77775593943916e-07, "loss": 8.341670036315918e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 492.5, "completions/min_length": 441.0, "epoch": 3.7161764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.2028957605361938, "kl": 0.005301199504174292, "learning_rate": 9.777377423885561e-07, "loss": 5.291774868965149e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 456.3125, "completions/min_length": 403.0, "epoch": 3.7176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2437022924423218, "kl": 0.006111952476203442, "learning_rate": 9.776998593610427e-07, "loss": 6.092496187193319e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 456.5, "completions/min_length": 403.0, "epoch": 3.7191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8664073348045349, "kl": 0.004127703374251723, "learning_rate": 9.776619448638715e-07, "loss": 4.106615961063653e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 423.75, "completions/min_length": 369.0, "epoch": 3.7205882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.012277212925255299, "kl": 0.004975594230927527, "learning_rate": 9.776239988995399e-07, "loss": 4.957327473675832e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 483.3125, "completions/min_length": 406.0, "epoch": 3.722058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749083518981934, "kl": 0.004731508903205395, "learning_rate": 9.775860214705482e-07, "loss": 4.725903272628784e-05, "reward": 0.35362499952316284, "reward_std": 0.2115243524312973, "rewards/DrugCombAccuracyCOTORM/mean": 0.27406251430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.4358639717102051, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34375, "rewards/DrugCombCoverageCOTORM/std": 0.4366062581539154, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 488.75, "completions/min_length": 389.0, "epoch": 3.723529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 3.196712017059326, "kl": 0.0225731372484006, "learning_rate": 9.77548012579398e-07, "loss": 0.00021598604507744312, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 485.5625, "completions/min_length": 385.0, "epoch": 3.725, "frac_reward_zero_std": 0.5, "grad_norm": 1.0620626211166382, "kl": 0.006768610212020576, "learning_rate": 9.775099722285934e-07, "loss": 6.738871161360294e-05, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 451.0625, "completions/min_length": 383.0, "epoch": 3.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017618656158447266, "kl": 0.006166839622892439, "learning_rate": 9.7747190042064e-07, "loss": 6.01420397288166e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 469.6875, "completions/min_length": 428.0, "epoch": 3.7279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0686466693878174, "kl": 0.006112162838689983, "learning_rate": 9.774337971580462e-07, "loss": 6.119403406046331e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 427.3125, "completions/min_length": 365.0, "epoch": 3.7294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2623112201690674, "kl": 0.006301094894297421, "learning_rate": 9.773956624433221e-07, "loss": 6.276369094848633e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 447.0625, "completions/min_length": 391.0, "epoch": 3.7308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013962875120341778, "kl": 0.0044895370956510305, "learning_rate": 9.7735749627898e-07, "loss": 4.4968041038373485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 448.1875, "completions/min_length": 391.0, "epoch": 3.7323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.015191994607448578, "kl": 0.005588082014583051, "learning_rate": 9.77319298667534e-07, "loss": 5.5336669902317226e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 479.5, "completions/min_length": 450.0, "epoch": 3.7338235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.015646222978830338, "kl": 0.0060916958609595895, "learning_rate": 9.772810696115004e-07, "loss": 6.078738806536421e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 476.4375, "completions/min_length": 407.0, "epoch": 3.735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8936718702316284, "kl": 0.00616204459220171, "learning_rate": 9.77242809113398e-07, "loss": 6.056805796106346e-05, "reward": 0.9941142797470093, "reward_std": 0.016647296026349068, "rewards/DrugCombAccuracyCOTORM/mean": 0.992642879486084, "rewards/DrugCombAccuracyCOTORM/std": 0.029428573325276375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 417.1875, "completions/min_length": 381.0, "epoch": 3.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007578388322144747, "kl": 0.004588878538925201, "learning_rate": 9.77204517175747e-07, "loss": 4.5924956793896854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 421.875, "completions/min_length": 363.0, "epoch": 3.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0425001382827759, "kl": 0.003870664571877569, "learning_rate": 9.7716619380107e-07, "loss": 3.849714994430542e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 437.875, "completions/min_length": 396.0, "epoch": 3.739705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.019642964005470276, "kl": 0.006895252037793398, "learning_rate": 9.771278389918918e-07, "loss": 6.906664202688262e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 485.9375, "completions/min_length": 416.0, "epoch": 3.7411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4204102754592896, "kl": 0.005249578272923827, "learning_rate": 9.770894527507393e-07, "loss": 5.278736352920532e-05, "reward": 0.8421875238418579, "reward_std": 0.3486824631690979, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 442.4375, "completions/min_length": 370.0, "epoch": 3.7426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.024889880791306496, "kl": 0.004732555011287332, "learning_rate": 9.770510350801406e-07, "loss": 4.793888365384191e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 473.0625, "completions/min_length": 423.0, "epoch": 3.7441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8236225843429565, "kl": 0.005480359774082899, "learning_rate": 9.77012585982627e-07, "loss": 5.482882261276245e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 439.6875, "completions/min_length": 392.0, "epoch": 3.7455882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010603101924061775, "kl": 0.004340434272307903, "learning_rate": 9.769741054607316e-07, "loss": 4.362436811788939e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 545.0, "completions/min_length": 478.0, "epoch": 3.7470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8385363817214966, "kl": 0.006470845662988722, "learning_rate": 9.769355935169888e-07, "loss": 6.444379687309265e-05, "reward": 0.7642333507537842, "reward_std": 0.1975167840719223, "rewards/DrugCombAccuracyCOTORM/mean": 0.7136250138282776, "rewards/DrugCombAccuracyCOTORM/std": 0.44267624616622925, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9333333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.14401644468307495, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 471.25, "completions/min_length": 402.0, "epoch": 3.748529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1158515214920044, "kl": 0.006235121749341488, "learning_rate": 9.768970501539366e-07, "loss": 6.211176514625549e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 443.8125, "completions/min_length": 353.0, "epoch": 3.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.010430770926177502, "kl": 0.0049352070200257, "learning_rate": 9.768584753741134e-07, "loss": 4.958492354489863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 428.375, "completions/min_length": 387.0, "epoch": 3.751470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015127074904739857, "kl": 0.00602483213879168, "learning_rate": 9.768198691800607e-07, "loss": 6.035029218764976e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 456.5625, "completions/min_length": 390.0, "epoch": 3.7529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1308659315109253, "kl": 0.005314514506608248, "learning_rate": 9.767812315743215e-07, "loss": 5.3416813898365945e-05, "reward": 0.574999988079071, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 435.875, "completions/min_length": 395.0, "epoch": 3.7544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.025634007528424263, "kl": 0.0066110590705648065, "learning_rate": 9.767425625594415e-07, "loss": 6.458268035203218e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 522.125, "completions/min_length": 407.0, "epoch": 3.7558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9020438194274902, "kl": 0.005974255385808647, "learning_rate": 9.76703862137968e-07, "loss": 5.964003503322601e-05, "reward": 0.6929908990859985, "reward_std": 0.12279372662305832, "rewards/DrugCombAccuracyCOTORM/mean": 0.6388949155807495, "rewards/DrugCombAccuracyCOTORM/std": 0.41361695528030396, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8187500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.3500198721885681, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 430.875, "completions/min_length": 382.0, "epoch": 3.7573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2870084047317505, "kl": 0.007180056883953512, "learning_rate": 9.766651303124502e-07, "loss": 7.194280624389648e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 468.75, "completions/min_length": 400.0, "epoch": 3.7588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.785844624042511, "kl": 0.0055356817319989204, "learning_rate": 9.766263670854401e-07, "loss": 5.547388718696311e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 436.4375, "completions/min_length": 398.0, "epoch": 3.760294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012409523129463196, "kl": 0.006050536758266389, "learning_rate": 9.765875724594911e-07, "loss": 6.063986802473664e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 433.4375, "completions/min_length": 385.0, "epoch": 3.761764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2069025039672852, "kl": 0.012778382981196046, "learning_rate": 9.76548746437159e-07, "loss": 0.00012137368321418762, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 440.1875, "completions/min_length": 398.0, "epoch": 3.763235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9460178017616272, "kl": 0.00636655418202281, "learning_rate": 9.765098890210014e-07, "loss": 6.393343210220337e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 493.125, "completions/min_length": 428.0, "epoch": 3.764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03360339254140854, "kl": 0.007300851633772254, "learning_rate": 9.764710002135782e-07, "loss": 7.288655615411699e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 508.5625, "completions/min_length": 421.0, "epoch": 3.7661764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9911452531814575, "kl": 0.010069677722640336, "learning_rate": 9.764320800174514e-07, "loss": 0.00010188961459789425, "reward": 0.921625018119812, "reward_std": 0.14512228965759277, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 457.25, "completions/min_length": 356.0, "epoch": 3.7676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01649237796664238, "kl": 0.005486211623065174, "learning_rate": 9.763931284351847e-07, "loss": 5.490459079737775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 471.125, "completions/min_length": 393.0, "epoch": 3.7691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010744420811533928, "kl": 0.004619542742148042, "learning_rate": 9.763541454693444e-07, "loss": 4.57406640634872e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 473.9375, "completions/min_length": 404.0, "epoch": 3.7705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.8936728239059448, "kl": 0.007770843571051955, "learning_rate": 9.763151311224986e-07, "loss": 7.690489292144775e-05, "reward": 0.6214166879653931, "reward_std": 0.46475428342819214, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8001735806465149, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 458.9375, "completions/min_length": 395.0, "epoch": 3.7720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8926375508308411, "kl": 0.005617997841909528, "learning_rate": 9.762760853972172e-07, "loss": 5.61252236366272e-05, "reward": 0.643750011920929, "reward_std": 0.1237436980009079, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 462.8125, "completions/min_length": 374.0, "epoch": 3.773529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.160595417022705, "kl": 0.005536468815989792, "learning_rate": 9.762370082960726e-07, "loss": 5.6043267250061035e-05, "reward": 0.5668333172798157, "reward_std": 0.0807471051812172, "rewards/DrugCombAccuracyCOTORM/mean": 0.5366666913032532, "rewards/DrugCombAccuracyCOTORM/std": 0.48374465107917786, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.8595865368843079, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 391.3125, "completions/min_length": 328.0, "epoch": 3.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.011264679953455925, "kl": 0.004806034965440631, "learning_rate": 9.76197899821639e-07, "loss": 4.8002701078075916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 455.8125, "completions/min_length": 382.0, "epoch": 3.776470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0568524599075317, "kl": 0.006787873804569244, "learning_rate": 9.761587599764931e-07, "loss": 6.83204852975905e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 497.25, "completions/min_length": 409.0, "epoch": 3.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.07392511516809464, "kl": 0.007658745162189007, "learning_rate": 9.76119588763213e-07, "loss": 7.753792306175455e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 425.5625, "completions/min_length": 327.0, "epoch": 3.7794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.671849250793457, "kl": 0.0038641500868834555, "learning_rate": 9.760803861843794e-07, "loss": 3.904849290847778e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 457.125, "completions/min_length": 398.0, "epoch": 3.7808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02685016766190529, "kl": 0.007339541451074183, "learning_rate": 9.760411522425746e-07, "loss": 7.31733744032681e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 461.5625, "completions/min_length": 406.0, "epoch": 3.7823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9271036386489868, "kl": 0.005097792367450893, "learning_rate": 9.760018869403835e-07, "loss": 5.087482713861391e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 424.5, "completions/min_length": 387.0, "epoch": 3.7838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.018813248723745346, "kl": 0.005728976568207145, "learning_rate": 9.759625902803928e-07, "loss": 5.7896715588867664e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 426.4375, "completions/min_length": 361.0, "epoch": 3.7852941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 0.9995341300964355, "kl": 0.005237853154540062, "learning_rate": 9.75923262265191e-07, "loss": 5.270540714263916e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 510.875, "completions/min_length": 415.0, "epoch": 3.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9676865935325623, "kl": 0.005257553653791547, "learning_rate": 9.75883902897369e-07, "loss": 5.295872688293457e-05, "reward": 0.09375, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 425.6875, "completions/min_length": 379.0, "epoch": 3.788235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068415985442698, "kl": 0.00408618402434513, "learning_rate": 9.7584451217952e-07, "loss": 4.138466465519741e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 511.125, "completions/min_length": 384.0, "epoch": 3.789705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.5728643536567688, "kl": 0.004180996969807893, "learning_rate": 9.758050901142388e-07, "loss": 4.1918930946849287e-05, "reward": 0.8148000240325928, "reward_std": 0.1520114243030548, "rewards/DrugCombAccuracyCOTORM/mean": 0.7747499942779541, "rewards/DrugCombAccuracyCOTORM/std": 0.34941887855529785, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.949999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.08944271504878998, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 470.125, "completions/min_length": 411.0, "epoch": 3.791176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069501386024057865, "kl": 0.004497513524256647, "learning_rate": 9.75765636704122e-07, "loss": 4.518661444308236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 510.25, "completions/min_length": 450.0, "epoch": 3.7926470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.3262912034988403, "kl": 0.00564153993036598, "learning_rate": 9.757261519517693e-07, "loss": 5.688518285751343e-05, "reward": 0.800000011920929, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 448.0, "completions/min_length": 401.0, "epoch": 3.7941176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.0983854532241821, "kl": 0.005982680479064584, "learning_rate": 9.756866358597818e-07, "loss": 5.948171019554138e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 465.3125, "completions/min_length": 396.0, "epoch": 3.7955882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 1.3759782314300537, "kl": 0.006631560390815139, "learning_rate": 9.756470884307624e-07, "loss": 6.571412086486816e-05, "reward": 0.5376042127609253, "reward_std": 0.23462410271167755, "rewards/DrugCombAccuracyCOTORM/mean": 0.48124998807907104, "rewards/DrugCombAccuracyCOTORM/std": 0.3986344635486603, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.4367387592792511, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 463.0, "completions/min_length": 369.0, "epoch": 3.7970588235294116, "frac_reward_zero_std": 0.5, "grad_norm": 1.0118803977966309, "kl": 0.006106080487370491, "learning_rate": 9.756075096673163e-07, "loss": 6.0935504734516144e-05, "reward": 0.5651666522026062, "reward_std": 0.050542138516902924, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 448.375, "completions/min_length": 369.0, "epoch": 3.798529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9463959336280823, "kl": 0.004221553273964673, "learning_rate": 9.755678995720515e-07, "loss": 4.2234336433466524e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 511.25, "completions/min_length": 475.0, "epoch": 3.8, "frac_reward_zero_std": 0.5, "grad_norm": 0.9561076164245605, "kl": 0.004891319200396538, "learning_rate": 9.755282581475767e-07, "loss": 4.966348933521658e-05, "reward": 0.23929166793823242, "reward_std": 0.0048319012857973576, "rewards/DrugCombAccuracyCOTORM/mean": 0.07906250655651093, "rewards/DrugCombAccuracyCOTORM/std": 0.08272884786128998, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 459.4375, "completions/min_length": 385.0, "epoch": 3.8014705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 1.10097336769104, "kl": 0.014261760748922825, "learning_rate": 9.754885853965038e-07, "loss": 0.00014794617891311646, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 478.625, "completions/min_length": 382.0, "epoch": 3.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.323254942893982, "kl": 0.007156079402193427, "learning_rate": 9.75448881321446e-07, "loss": 7.19800591468811e-05, "reward": 0.8631384968757629, "reward_std": 0.026117896661162376, "rewards/DrugCombAccuracyCOTORM/mean": 0.8472824692726135, "rewards/DrugCombAccuracyCOTORM/std": 0.16432899236679077, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8531249761581421, "rewards/DrugCombCoverageCOTORM/std": 0.152171790599823, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 439.875, "completions/min_length": 401.0, "epoch": 3.8044117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.011691546998918056, "kl": 0.00529722950886935, "learning_rate": 9.754091459250193e-07, "loss": 5.275497096590698e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 456.625, "completions/min_length": 397.0, "epoch": 3.8058823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.01637485809624195, "kl": 0.005285939434543252, "learning_rate": 9.753693792098412e-07, "loss": 5.283578502712771e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 508.0625, "completions/min_length": 396.0, "epoch": 3.807352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.915169358253479, "kl": 0.0057767475955188274, "learning_rate": 9.753295811785313e-07, "loss": 5.764514207839966e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 406.125, "completions/min_length": 356.0, "epoch": 3.8088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.014616405591368675, "kl": 0.006163697107695043, "learning_rate": 9.752897518337114e-07, "loss": 6.153324648039415e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/mean_length": 555.375, "completions/min_length": 470.0, "epoch": 3.810294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9747492074966431, "kl": 0.004559229884762317, "learning_rate": 9.75249891178006e-07, "loss": 4.629790782928467e-05, "reward": 0.7552499771118164, "reward_std": 0.08361775428056717, "rewards/DrugCombAccuracyCOTORM/mean": 0.6966666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.3421500623226166, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 425.5, "completions/min_length": 380.0, "epoch": 3.8117647058823527, "frac_reward_zero_std": 1.0, "grad_norm": 0.01744949445128441, "kl": 0.006503273849375546, "learning_rate": 9.752099992140399e-07, "loss": 6.464304169639945e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 449.9375, "completions/min_length": 384.0, "epoch": 3.8132352941176473, "frac_reward_zero_std": 1.0, "grad_norm": 0.017922518774867058, "kl": 0.006165580591186881, "learning_rate": 9.751700759444417e-07, "loss": 6.149875844130293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 457.125, "completions/min_length": 338.0, "epoch": 3.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0974290370941162, "kl": 0.006378254271112382, "learning_rate": 9.751301213718416e-07, "loss": 6.339684478007257e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 451.3125, "completions/min_length": 382.0, "epoch": 3.8161764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.811265230178833, "kl": 0.005096668435726315, "learning_rate": 9.750901354988714e-07, "loss": 5.102657087263651e-05, "reward": 0.6213333010673523, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 469.5625, "completions/min_length": 438.0, "epoch": 3.817647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0452221632003784, "kl": 0.006708530709147453, "learning_rate": 9.750501183281654e-07, "loss": 6.718697841279209e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 486.4375, "completions/min_length": 398.0, "epoch": 3.8191176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.0371973514556885, "kl": 0.006878096726723015, "learning_rate": 9.7501006986236e-07, "loss": 6.930530071258545e-05, "reward": 0.7851458787918091, "reward_std": 0.12196674197912216, "rewards/DrugCombAccuracyCOTORM/mean": 0.7620312571525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3244814872741699, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7552083134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5141234397888184, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/mean_length": 447.125, "completions/min_length": 367.0, "epoch": 3.8205882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 0.6079848408699036, "kl": 0.004643574124202132, "learning_rate": 9.749699901040931e-07, "loss": 4.7773122787475586e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 458.5625, "completions/min_length": 401.0, "epoch": 3.822058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.041861653327942, "kl": 0.0051397260394878685, "learning_rate": 9.749298790560052e-07, "loss": 5.177987986826338e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 445.0625, "completions/min_length": 388.0, "epoch": 3.8235294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.0526914596557617, "kl": 0.005283082369714975, "learning_rate": 9.748897367207389e-07, "loss": 5.2632785809691995e-05, "reward": 0.9416666626930237, "reward_std": 0.08017835766077042, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 467.9375, "completions/min_length": 402.0, "epoch": 3.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.014065028168261051, "kl": 0.006865631323307753, "learning_rate": 9.748495631009385e-07, "loss": 6.806636520195752e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 395.0, "completions/min_length": 364.0, "epoch": 3.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.07071990519762039, "kl": 0.011913818190805614, "learning_rate": 9.748093581992506e-07, "loss": 0.00012171166599728167, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 444.6875, "completions/min_length": 393.0, "epoch": 3.8279411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 1.0037868022918701, "kl": 0.004342023283243179, "learning_rate": 9.747691220183236e-07, "loss": 4.3392181396484375e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 423.8125, "completions/min_length": 364.0, "epoch": 3.8294117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 0.9333858489990234, "kl": 0.005370253173168749, "learning_rate": 9.747288545608085e-07, "loss": 5.353157757781446e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 452.5625, "completions/min_length": 386.0, "epoch": 3.8308823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.3827706575393677, "kl": 0.006404673331417143, "learning_rate": 9.746885558293578e-07, "loss": 6.394088268280029e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 428.25, "completions/min_length": 369.0, "epoch": 3.8323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.376296043395996, "kl": 0.006050114752724767, "learning_rate": 9.746482258266265e-07, "loss": 6.0733407735824585e-05, "reward": 0.629687488079071, "reward_std": 0.4222930073738098, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9464847445487976, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 403.5625, "completions/min_length": 322.0, "epoch": 3.833823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.9259891510009766, "kl": 0.006321940221823752, "learning_rate": 9.74607864555271e-07, "loss": 6.30691647529602e-05, "reward": 0.7875000238418579, "reward_std": 0.36780595779418945, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 441.3125, "completions/min_length": 349.0, "epoch": 3.835294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1038964986801147, "kl": 0.004401982994750142, "learning_rate": 9.745674720179507e-07, "loss": 4.4092535972595215e-05, "reward": 0.6464166641235352, "reward_std": 0.22610123455524445, "rewards/DrugCombAccuracyCOTORM/mean": 0.6387500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4844498932361603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9464848041534424, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 433.125, "completions/min_length": 382.0, "epoch": 3.836764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008292462676763535, "kl": 0.004827325115911663, "learning_rate": 9.74527048217326e-07, "loss": 4.818918023374863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 506.0, "completions/min_length": 425.0, "epoch": 3.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8283213973045349, "kl": 0.005101798567920923, "learning_rate": 9.744865931560604e-07, "loss": 5.0954840844497085e-05, "reward": 0.648687481880188, "reward_std": 0.11094880849123001, "rewards/DrugCombAccuracyCOTORM/mean": 0.5745312571525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4781929850578308, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.1385064274072647, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 520.5, "completions/min_length": 428.0, "epoch": 3.8397058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 1.4301005601882935, "kl": 0.006806417601183057, "learning_rate": 9.744461068368191e-07, "loss": 6.888061761856079e-05, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 448.625, "completions/min_length": 390.0, "epoch": 3.8411764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.00857988465577364, "kl": 0.004958895966410637, "learning_rate": 9.744055892622687e-07, "loss": 4.951081791659817e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 449.5625, "completions/min_length": 342.0, "epoch": 3.8426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0626590251922607, "kl": 0.005125776748172939, "learning_rate": 9.743650404350784e-07, "loss": 5.169377982383594e-05, "reward": 0.925000011920929, "reward_std": 0.14880476891994476, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 428.4375, "completions/min_length": 355.0, "epoch": 3.8441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.099703311920166, "kl": 0.006863524555228651, "learning_rate": 9.7432446035792e-07, "loss": 6.84782862663269e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 438.6875, "completions/min_length": 406.0, "epoch": 3.8455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.007416778244078159, "kl": 0.00457276968518272, "learning_rate": 9.742838490334666e-07, "loss": 4.5518863771576434e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 428.875, "completions/min_length": 400.0, "epoch": 3.847058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0307765007019043, "kl": 0.007878574659116566, "learning_rate": 9.742432064643934e-07, "loss": 7.735771941952407e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 410.0625, "completions/min_length": 350.0, "epoch": 3.848529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.04034329950809479, "kl": 0.007158509804867208, "learning_rate": 9.74202532653378e-07, "loss": 7.223600550787523e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 410.125, "completions/min_length": 355.0, "epoch": 3.85, "frac_reward_zero_std": 0.5, "grad_norm": 1.181467056274414, "kl": 0.00625090894754976, "learning_rate": 9.741618276030996e-07, "loss": 6.221234798431396e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/mean_length": 582.5625, "completions/min_length": 410.0, "epoch": 3.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7407978177070618, "kl": 0.005150890094228089, "learning_rate": 9.7412109131624e-07, "loss": 5.113344013807364e-05, "reward": 0.29573503136634827, "reward_std": 0.22763003408908844, "rewards/DrugCombAccuracyCOTORM/mean": 0.2256583720445633, "rewards/DrugCombAccuracyCOTORM/std": 0.41295722126960754, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.15208333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6441438794136047, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 471.875, "completions/min_length": 400.0, "epoch": 3.8529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012183000333607197, "kl": 0.005286927451379597, "learning_rate": 9.74080323795483e-07, "loss": 5.267009692033753e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 498.375, "completions/min_length": 378.0, "epoch": 3.8544117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.3963866233825684, "kl": 0.004984260885976255, "learning_rate": 9.740395250435138e-07, "loss": 4.968792200088501e-05, "reward": 0.3866071403026581, "reward_std": 0.1557251363992691, "rewards/DrugCombAccuracyCOTORM/mean": 0.3452380895614624, "rewards/DrugCombAccuracyCOTORM/std": 0.41129186749458313, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.10416668653488159, "rewards/DrugCombCoverageCOTORM/std": 1.0089874267578125, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 450.1875, "completions/min_length": 403.0, "epoch": 3.8558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01603974960744381, "kl": 0.00483719480689615, "learning_rate": 9.739986950630202e-07, "loss": 4.830334728467278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 500.0, "completions/min_length": 406.0, "epoch": 3.8573529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.3919577598571777, "kl": 0.005611037486232817, "learning_rate": 9.739578338566922e-07, "loss": 5.589425563812256e-05, "reward": 0.6567916870117188, "reward_std": 0.25761985778808594, "rewards/DrugCombAccuracyCOTORM/mean": 0.5853124856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.3781290352344513, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.11735905706882477, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 463.625, "completions/min_length": 402.0, "epoch": 3.8588235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.015371517278254032, "kl": 0.0059549647849053144, "learning_rate": 9.739169414272217e-07, "loss": 5.9052734286524355e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 422.625, "completions/min_length": 370.0, "epoch": 3.860294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010833999142050743, "kl": 0.005899813841097057, "learning_rate": 9.738760177773025e-07, "loss": 5.914106804993935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 392.125, "completions/min_length": 341.0, "epoch": 3.861764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008086414076387882, "kl": 0.00553769723046571, "learning_rate": 9.738350629096302e-07, "loss": 5.5077558499760926e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 429.1875, "completions/min_length": 358.0, "epoch": 3.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0884833335876465, "kl": 0.0064377011731266975, "learning_rate": 9.737940768269032e-07, "loss": 6.312354526016861e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 439.5625, "completions/min_length": 381.0, "epoch": 3.864705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.4213321208953857, "kl": 0.005628008744679391, "learning_rate": 9.737530595318215e-07, "loss": 5.6587159633636475e-05, "reward": 0.7875000238418579, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 521.0, "completions/min_length": 368.0, "epoch": 3.8661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4230177402496338, "kl": 0.006253859610296786, "learning_rate": 9.737120110270872e-07, "loss": 6.34118914604187e-05, "reward": 0.42704758048057556, "reward_std": 0.1534026861190796, "rewards/DrugCombAccuracyCOTORM/mean": 0.3041219115257263, "rewards/DrugCombAccuracyCOTORM/std": 0.37451526522636414, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8374999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.5004997849464417, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 461.0625, "completions/min_length": 410.0, "epoch": 3.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.007869942113757133, "kl": 0.0042336914339102805, "learning_rate": 9.736709313154042e-07, "loss": 4.214228829368949e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 485.6875, "completions/min_length": 417.0, "epoch": 3.8691176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.8670250177383423, "kl": 0.004643922904506326, "learning_rate": 9.736298203994792e-07, "loss": 4.669651389122009e-05, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 441.0625, "completions/min_length": 340.0, "epoch": 3.8705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9518478512763977, "kl": 0.006174073903821409, "learning_rate": 9.735886782820201e-07, "loss": 6.13182783126831e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 454.375, "completions/min_length": 400.0, "epoch": 3.8720588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4833219051361084, "kl": 0.006374167860485613, "learning_rate": 9.735475049657374e-07, "loss": 6.395578384399414e-05, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 533.5625, "completions/min_length": 488.0, "epoch": 3.873529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.319795846939087, "kl": 0.005103518022224307, "learning_rate": 9.735063004533435e-07, "loss": 5.0649046897888184e-05, "reward": 0.41833335161209106, "reward_std": 0.2802271842956543, "rewards/DrugCombAccuracyCOTORM/mean": 0.35624998807907104, "rewards/DrugCombAccuracyCOTORM/std": 0.38981834053993225, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3333333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7200823426246643, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 451.5625, "completions/min_length": 363.0, "epoch": 3.875, "frac_reward_zero_std": 0.5, "grad_norm": 1.1671205759048462, "kl": 0.005804293556138873, "learning_rate": 9.73465064747553e-07, "loss": 5.7660043239593506e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 419.125, "completions/min_length": 350.0, "epoch": 3.876470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012511846609413624, "kl": 0.005349682178348303, "learning_rate": 9.734237978510817e-07, "loss": 5.310621781973168e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 455.375, "completions/min_length": 362.0, "epoch": 3.8779411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5172618627548218, "kl": 0.005471033917274326, "learning_rate": 9.733824997666491e-07, "loss": 5.4895877838134766e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990126132965, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 471.0, "completions/min_length": 418.0, "epoch": 3.8794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014387469738721848, "kl": 0.005073063657619059, "learning_rate": 9.733411704969753e-07, "loss": 5.071596024208702e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 453.25, "completions/min_length": 400.0, "epoch": 3.8808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014286196790635586, "kl": 0.005412984814029187, "learning_rate": 9.73299810044783e-07, "loss": 5.4097392421681434e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 437.1875, "completions/min_length": 367.0, "epoch": 3.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009738760069012642, "kl": 0.004112701339181513, "learning_rate": 9.732584184127973e-07, "loss": 4.1360119212185964e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 481.875, "completions/min_length": 417.0, "epoch": 3.8838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0299808979034424, "kl": 0.005040301475673914, "learning_rate": 9.732169956037443e-07, "loss": 5.029141902923584e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 407.125, "completions/min_length": 343.0, "epoch": 3.885294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 2.766432285308838, "kl": 0.004677946446463466, "learning_rate": 9.731755416203533e-07, "loss": 4.688459375756793e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 466.625, "completions/min_length": 414.0, "epoch": 3.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8335770964622498, "kl": 0.005493533390108496, "learning_rate": 9.731340564653552e-07, "loss": 5.3860247135162354e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 458.5625, "completions/min_length": 376.0, "epoch": 3.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009410625323653221, "kl": 0.004549265780951828, "learning_rate": 9.730925401414828e-07, "loss": 4.5495144149754196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 467.0625, "completions/min_length": 398.0, "epoch": 3.889705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.372370958328247, "kl": 0.006056776794139296, "learning_rate": 9.73050992651471e-07, "loss": 6.021559238433838e-05, "reward": 0.7977499961853027, "reward_std": 0.37449419498443604, "rewards/DrugCombAccuracyCOTORM/mean": 0.7706249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4125242531299591, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 452.125, "completions/min_length": 381.0, "epoch": 3.8911764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9678066372871399, "kl": 0.008067352464422584, "learning_rate": 9.730094139980568e-07, "loss": 7.99732661107555e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 434.1875, "completions/min_length": 371.0, "epoch": 3.8926470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.2316187620162964, "kl": 0.007894585723988712, "learning_rate": 9.729678041839795e-07, "loss": 7.89538025856018e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 445.875, "completions/min_length": 391.0, "epoch": 3.8941176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1455308198928833, "kl": 0.008589377743192017, "learning_rate": 9.729261632119806e-07, "loss": 8.83557804627344e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 417.3125, "completions/min_length": 374.0, "epoch": 3.8955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01052623987197876, "kl": 0.005244682659395039, "learning_rate": 9.728844910848024e-07, "loss": 5.2466155466390774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 543.75, "completions/min_length": 498.0, "epoch": 3.8970588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3988438844680786, "kl": 0.006733338930644095, "learning_rate": 9.728427878051907e-07, "loss": 6.762892007827759e-05, "reward": 0.4864687919616699, "reward_std": 0.24467135965824127, "rewards/DrugCombAccuracyCOTORM/mean": 0.4078906178474426, "rewards/DrugCombAccuracyCOTORM/std": 0.4189172089099884, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6015625, "rewards/DrugCombCoverageCOTORM/std": 0.3571261167526245, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 466.375, "completions/min_length": 423.0, "epoch": 3.898529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010783988051116467, "kl": 0.0048497566021978855, "learning_rate": 9.72801053375893e-07, "loss": 4.8359372158301994e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 425.5625, "completions/min_length": 363.0, "epoch": 3.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.017905768007040024, "kl": 0.006787714082747698, "learning_rate": 9.727592877996584e-07, "loss": 6.828620098531246e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 434.5625, "completions/min_length": 373.0, "epoch": 3.901470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.06208116561174393, "kl": 0.0071005127392709255, "learning_rate": 9.72717491079238e-07, "loss": 7.130853919079527e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 428.1875, "completions/min_length": 373.0, "epoch": 3.902941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011836073361337185, "kl": 0.005028901330661029, "learning_rate": 9.72675663217386e-07, "loss": 4.984916449757293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 456.375, "completions/min_length": 389.0, "epoch": 3.9044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9302031397819519, "kl": 0.0055356164230033755, "learning_rate": 9.726338042168574e-07, "loss": 5.5342912673950195e-05, "reward": 0.737500011920929, "reward_std": 0.2183542400598526, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 410.5, "completions/min_length": 356.0, "epoch": 3.9058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.042763352394104, "kl": 0.005167149240151048, "learning_rate": 9.725919140804098e-07, "loss": 5.1838178478647023e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 441.5625, "completions/min_length": 375.0, "epoch": 3.9073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.015004703775048256, "kl": 0.005941925337538123, "learning_rate": 9.725499928108027e-07, "loss": 5.947726822341792e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 448.25, "completions/min_length": 386.0, "epoch": 3.9088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.009973244741559029, "kl": 0.004983065300621092, "learning_rate": 9.725080404107982e-07, "loss": 4.968188295606524e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 424.5, "completions/min_length": 387.0, "epoch": 3.9102941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.010306855663657188, "kl": 0.005181329499464482, "learning_rate": 9.724660568831597e-07, "loss": 5.190322553971782e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 481.8125, "completions/min_length": 378.0, "epoch": 3.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0979807376861572, "kl": 0.006999610923230648, "learning_rate": 9.72424042230653e-07, "loss": 6.962567567825317e-05, "reward": 0.7854499816894531, "reward_std": 0.19620780646800995, "rewards/DrugCombAccuracyCOTORM/mean": 0.7474374771118164, "rewards/DrugCombAccuracyCOTORM/std": 0.4070213735103607, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.2294922024011612, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 481.375, "completions/min_length": 432.0, "epoch": 3.913235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0074615478515625, "kl": 0.006549085606820881, "learning_rate": 9.72381996456046e-07, "loss": 6.465319165727124e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 423.25, "completions/min_length": 375.0, "epoch": 3.914705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009248370304703712, "kl": 0.005057527916505933, "learning_rate": 9.723399195621085e-07, "loss": 5.0836093578254804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 405.0625, "completions/min_length": 364.0, "epoch": 3.916176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.6092934608459473, "kl": 0.007319501484744251, "learning_rate": 9.722978115516124e-07, "loss": 7.293373346328735e-05, "reward": 0.7875000238418579, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 436.8125, "completions/min_length": 391.0, "epoch": 3.9176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.20652979612350464, "kl": 0.013114329776726663, "learning_rate": 9.722556724273318e-07, "loss": 0.00013382633915171027, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 450.5, "completions/min_length": 413.0, "epoch": 3.9191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.01170336827635765, "kl": 0.005505293724127114, "learning_rate": 9.722135021920425e-07, "loss": 5.4671429097652435e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 459.625, "completions/min_length": 378.0, "epoch": 3.9205882352941175, "frac_reward_zero_std": 0.5, "grad_norm": 1.2081362009048462, "kl": 0.005999926419463009, "learning_rate": 9.72171300848523e-07, "loss": 5.978457193123177e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 431.5625, "completions/min_length": 358.0, "epoch": 3.9220588235294116, "frac_reward_zero_std": 1.0, "grad_norm": 0.013920494355261326, "kl": 0.00531373277772218, "learning_rate": 9.721290683995526e-07, "loss": 5.3252631914801896e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 536.3125, "completions/min_length": 481.0, "epoch": 3.923529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4584548473358154, "kl": 0.005388106743339449, "learning_rate": 9.720868048479145e-07, "loss": 5.4366886615753174e-05, "reward": 0.9159375429153442, "reward_std": 0.2377646416425705, "rewards/DrugCombAccuracyCOTORM/mean": 0.9007812738418579, "rewards/DrugCombAccuracyCOTORM/std": 0.27153533697128296, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.1359764039516449, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 457.875, "completions/min_length": 374.0, "epoch": 3.925, "frac_reward_zero_std": 0.5, "grad_norm": 1.243138074874878, "kl": 0.010033339727669954, "learning_rate": 9.72044510196392e-07, "loss": 0.00010273512452840805, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 460.8125, "completions/min_length": 417.0, "epoch": 3.9264705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.009937825612723827, "kl": 0.0052037694258615375, "learning_rate": 9.72002184447772e-07, "loss": 5.221518949838355e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 485.8125, "completions/min_length": 432.0, "epoch": 3.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016467854380607605, "kl": 0.009598452248610556, "learning_rate": 9.719598276048425e-07, "loss": 9.584898361936212e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 469.3125, "completions/min_length": 364.0, "epoch": 3.9294117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 1.0630439519882202, "kl": 0.0070070428773760796, "learning_rate": 9.71917439670394e-07, "loss": 6.942364416318014e-05, "reward": 0.9291666746139526, "reward_std": 0.07572401314973831, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 440.75, "completions/min_length": 390.0, "epoch": 3.9308823529411763, "frac_reward_zero_std": 1.0, "grad_norm": 0.007038698997348547, "kl": 0.00444881443399936, "learning_rate": 9.71875020647219e-07, "loss": 4.440393968252465e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/mean_length": 564.75, "completions/min_length": 407.0, "epoch": 3.932352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 21.81256103515625, "kl": 0.21466017770580947, "learning_rate": 9.718325705381115e-07, "loss": 0.0018952381797134876, "reward": 0.7140595316886902, "reward_std": 0.18231938779354095, "rewards/DrugCombAccuracyCOTORM/mean": 0.6758333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.42195576429367065, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7339285612106323, "rewards/DrugCombCoverageCOTORM/std": 0.6798734664916992, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 494.5, "completions/min_length": 392.0, "epoch": 3.9338235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.1792614459991455, "kl": 0.008048457966651767, "learning_rate": 9.717900893458684e-07, "loss": 8.042631088756025e-05, "reward": 0.6276666522026062, "reward_std": 0.16100794076919556, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6992059350013733, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 424.4375, "completions/min_length": 341.0, "epoch": 3.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014254036359488964, "kl": 0.005150897428393364, "learning_rate": 9.71747577073288e-07, "loss": 5.122430593473837e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 517.5625, "completions/min_length": 455.0, "epoch": 3.9367647058823527, "frac_reward_zero_std": 0.5, "grad_norm": 0.9244515299797058, "kl": 0.004595474747475237, "learning_rate": 9.717050337231714e-07, "loss": 4.608556628227234e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 447.25, "completions/min_length": 345.0, "epoch": 3.9382352941176473, "frac_reward_zero_std": 0.5, "grad_norm": 0.9525043368339539, "kl": 0.005645800265483558, "learning_rate": 9.716624592983206e-07, "loss": 5.606561899185181e-05, "reward": 0.5547500252723694, "reward_std": 0.06465678662061691, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8668269515037537, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 449.5625, "completions/min_length": 361.0, "epoch": 3.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011173461563885212, "kl": 0.005139426211826503, "learning_rate": 9.71619853801541e-07, "loss": 5.147729461896233e-05, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 454.25, "completions/min_length": 393.0, "epoch": 3.9411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011423139832913876, "kl": 0.0055848382180556655, "learning_rate": 9.715772172356386e-07, "loss": 5.581176083069295e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 466.125, "completions/min_length": 368.0, "epoch": 3.942647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.008295542560517788, "kl": 0.004833028418943286, "learning_rate": 9.715345496034229e-07, "loss": 4.805600474355742e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 489.9375, "completions/min_length": 427.0, "epoch": 3.9441176470588237, "frac_reward_zero_std": 0.5, "grad_norm": 1.0831971168518066, "kl": 0.006288826349191368, "learning_rate": 9.71491850907704e-07, "loss": 6.312534242169932e-05, "reward": 0.20000000298023224, "reward_std": 0.16903084516525269, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 458.3125, "completions/min_length": 380.0, "epoch": 3.9455882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.05626474320888519, "kl": 0.00824762531556189, "learning_rate": 9.714491211512955e-07, "loss": 8.067404996836558e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 475.9375, "completions/min_length": 404.0, "epoch": 3.947058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013129767030477524, "kl": 0.0050506870611570776, "learning_rate": 9.714063603370117e-07, "loss": 5.0497212214395404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 451.875, "completions/min_length": 397.0, "epoch": 3.9485294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.3539032936096191, "kl": 0.0072718076989986, "learning_rate": 9.713635684676699e-07, "loss": 7.398426532745361e-05, "reward": 0.692187488079071, "reward_std": 0.1912059336900711, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 449.0625, "completions/min_length": 379.0, "epoch": 3.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.007827644236385822, "kl": 0.005021596967708319, "learning_rate": 9.713207455460892e-07, "loss": 5.0257389375474304e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 436.75, "completions/min_length": 371.0, "epoch": 3.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011632395908236504, "kl": 0.004939076374284923, "learning_rate": 9.712778915750903e-07, "loss": 4.935808829031885e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 491.0, "completions/min_length": 397.0, "epoch": 3.9529411764705884, "frac_reward_zero_std": 0.5, "grad_norm": 1.004357099533081, "kl": 0.007051958818919957, "learning_rate": 9.712350065574968e-07, "loss": 7.095932960510254e-05, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 436.8125, "completions/min_length": 409.0, "epoch": 3.9544117647058825, "frac_reward_zero_std": 0.5, "grad_norm": 1.2695817947387695, "kl": 0.0056470162235200405, "learning_rate": 9.711920904961337e-07, "loss": 5.6354539992753416e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 442.875, "completions/min_length": 391.0, "epoch": 3.9558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9604895710945129, "kl": 0.006452080560848117, "learning_rate": 9.71149143393828e-07, "loss": 6.426495383493602e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 457.875, "completions/min_length": 379.0, "epoch": 3.9573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8421921133995056, "kl": 0.004408286651596427, "learning_rate": 9.711061652534089e-07, "loss": 4.382431507110596e-05, "reward": 0.17500001192092896, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 530.75, "completions/min_length": 457.0, "epoch": 3.958823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0412731170654297, "kl": 0.006910001276992261, "learning_rate": 9.710631560777082e-07, "loss": 6.822608702350408e-05, "reward": 0.5833333730697632, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 404.8125, "completions/min_length": 352.0, "epoch": 3.960294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012123906053602695, "kl": 0.0045923967263661325, "learning_rate": 9.710201158695586e-07, "loss": 4.57812930108048e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 446.6875, "completions/min_length": 387.0, "epoch": 3.961764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8087146878242493, "kl": 0.006046441150829196, "learning_rate": 9.709770446317958e-07, "loss": 6.0383867094060406e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 553.0, "completions/min_length": 439.0, "epoch": 3.963235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4121986627578735, "kl": 0.00626464857487008, "learning_rate": 9.709339423672574e-07, "loss": 6.269942969083786e-05, "reward": 0.7512670159339905, "reward_std": 0.21508750319480896, "rewards/DrugCombAccuracyCOTORM/mean": 0.6903858184814453, "rewards/DrugCombAccuracyCOTORM/std": 0.28940585255622864, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.02846374548971653, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 415.5625, "completions/min_length": 368.0, "epoch": 3.9647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.00831991620361805, "kl": 0.0043766762828454375, "learning_rate": 9.708908090787824e-07, "loss": 4.337586506153457e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 424.625, "completions/min_length": 383.0, "epoch": 3.9661764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.015010935254395008, "kl": 0.005354786699172109, "learning_rate": 9.708476447692126e-07, "loss": 5.320560740074143e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 518.9375, "completions/min_length": 458.0, "epoch": 3.9676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2220611572265625, "kl": 0.0053079730132594705, "learning_rate": 9.708044494413917e-07, "loss": 5.3843483328819275e-05, "reward": 0.7358333468437195, "reward_std": 0.1294279247522354, "rewards/DrugCombAccuracyCOTORM/mean": 0.6958333253860474, "rewards/DrugCombAccuracyCOTORM/std": 0.3730008602142334, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2950204014778137, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 502.25, "completions/min_length": 402.0, "epoch": 3.9691176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.2494293451309204, "kl": 0.014662184985354543, "learning_rate": 9.707612230981651e-07, "loss": 0.0001555904746055603, "reward": 0.42976629734039307, "reward_std": 0.3798466920852661, "rewards/DrugCombAccuracyCOTORM/mean": 0.32236412167549133, "rewards/DrugCombAccuracyCOTORM/std": 0.47344207763671875, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5764474868774414, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 425.25, "completions/min_length": 387.0, "epoch": 3.9705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.015024403110146523, "kl": 0.006280898698605597, "learning_rate": 9.707179657423805e-07, "loss": 6.267747812671587e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 486.25, "completions/min_length": 438.0, "epoch": 3.972058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.974919319152832, "kl": 0.004986764746718109, "learning_rate": 9.706746773768875e-07, "loss": 5.0211678171763197e-05, "reward": 0.7927083373069763, "reward_std": 0.2482253462076187, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.8003905415534973, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 465.75, "completions/min_length": 409.0, "epoch": 3.973529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00998733937740326, "kl": 0.0053712931694462895, "learning_rate": 9.706313580045377e-07, "loss": 5.392981620389037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 447.4375, "completions/min_length": 399.0, "epoch": 3.975, "frac_reward_zero_std": 0.5, "grad_norm": 0.8344728946685791, "kl": 0.005502121173776686, "learning_rate": 9.705880076281853e-07, "loss": 5.4641044698655605e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 411.0625, "completions/min_length": 361.0, "epoch": 3.976470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011354454793035984, "kl": 0.0052505695493891835, "learning_rate": 9.705446262506857e-07, "loss": 5.266086009214632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 483.0625, "completions/min_length": 447.0, "epoch": 3.9779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9514592289924622, "kl": 0.006648858543485403, "learning_rate": 9.70501213874897e-07, "loss": 6.646617839578539e-05, "reward": 0.8531041741371155, "reward_std": 0.014790322631597519, "rewards/DrugCombAccuracyCOTORM/mean": 0.8254948258399963, "rewards/DrugCombAccuracyCOTORM/std": 0.1827559769153595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 447.125, "completions/min_length": 358.0, "epoch": 3.9794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0080775022506714, "kl": 0.006815838976763189, "learning_rate": 9.704577705036792e-07, "loss": 6.793742795707658e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 460.3125, "completions/min_length": 386.0, "epoch": 3.9808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8388679623603821, "kl": 0.0047846881789155304, "learning_rate": 9.704142961398939e-07, "loss": 4.82611358165741e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 477.75, "completions/min_length": 421.0, "epoch": 3.9823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.0957623720169067, "kl": 0.005881379824131727, "learning_rate": 9.70370790786405e-07, "loss": 5.8332065236754715e-05, "reward": 0.1979166567325592, "reward_std": 0.005892557092010975, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 426.125, "completions/min_length": 368.0, "epoch": 3.9838235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.009899981319904327, "kl": 0.005085767479613423, "learning_rate": 9.703272544460791e-07, "loss": 5.0724313041428104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 452.75, "completions/min_length": 396.0, "epoch": 3.985294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.375478744506836, "kl": 0.0058119233581237495, "learning_rate": 9.702836871217837e-07, "loss": 5.9664249420166016e-05, "reward": 0.6000000238418579, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 419.5625, "completions/min_length": 383.0, "epoch": 3.986764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008860195986926556, "kl": 0.00572461576666683, "learning_rate": 9.702400888163894e-07, "loss": 5.739941116189584e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 513.8125, "completions/min_length": 424.0, "epoch": 3.988235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4946026802062988, "kl": 0.007467252900823951, "learning_rate": 9.701964595327679e-07, "loss": 7.443130016326904e-05, "reward": 0.7740449905395508, "reward_std": 0.25919878482818604, "rewards/DrugCombAccuracyCOTORM/mean": 0.7370875477790833, "rewards/DrugCombAccuracyCOTORM/std": 0.33781251311302185, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.25516778230667114, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 506.375, "completions/min_length": 438.0, "epoch": 3.989705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009958172217011452, "kl": 0.00453780050156638, "learning_rate": 9.701527992737935e-07, "loss": 4.5396598579827696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 496.5625, "completions/min_length": 376.0, "epoch": 3.9911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.6003856658935547, "kl": 0.005192571203224361, "learning_rate": 9.701091080423428e-07, "loss": 5.1666051149368286e-05, "reward": 0.7987916469573975, "reward_std": 0.2501852214336395, "rewards/DrugCombAccuracyCOTORM/mean": 0.79666668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.40022218227386475, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6145833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.8021238446235657, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 541.125, "completions/min_length": 483.0, "epoch": 3.9926470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2234373092651367, "kl": 0.005318815936334431, "learning_rate": 9.700653858412934e-07, "loss": 5.357712507247925e-05, "reward": 0.75, "reward_std": 0.31124579906463623, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 436.8125, "completions/min_length": 387.0, "epoch": 3.9941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.024348117411136627, "kl": 0.006643270025961101, "learning_rate": 9.70021632673526e-07, "loss": 6.635571480728686e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 486.1875, "completions/min_length": 391.0, "epoch": 3.9955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.2133680582046509, "kl": 0.006197621114552021, "learning_rate": 9.699778485419231e-07, "loss": 6.129592657089233e-05, "reward": 0.59375, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 468.375, "completions/min_length": 388.0, "epoch": 3.9970588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4849241971969604, "kl": 0.0070510918740183115, "learning_rate": 9.69934033449369e-07, "loss": 7.064640522003174e-05, "reward": 0.7555000185966492, "reward_std": 0.2810010015964508, "rewards/DrugCombAccuracyCOTORM/mean": 0.7256250381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.34456998109817505, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5055250525474548, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 466.1875, "completions/min_length": 368.0, "epoch": 3.998529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01072433590888977, "kl": 0.005501586711034179, "learning_rate": 9.698901873987498e-07, "loss": 5.5640703067183495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 448.8125, "completions/min_length": 402.0, "epoch": 4.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.016584662720561028, "kl": 0.005111579957883805, "learning_rate": 9.698463103929541e-07, "loss": 5.129391502123326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 437.0625, "completions/min_length": 369.0, "epoch": 4.001470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0149996941909194, "kl": 0.005354280467145145, "learning_rate": 9.698024024348727e-07, "loss": 5.39163411303889e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 468.0, "completions/min_length": 409.0, "epoch": 4.002941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5129165649414062, "kl": 0.005959968664683402, "learning_rate": 9.697584635273979e-07, "loss": 5.930662155151367e-05, "reward": 0.6937500238418579, "reward_std": 0.42636024951934814, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 457.75, "completions/min_length": 370.0, "epoch": 4.004411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.5273576974868774, "kl": 0.007971868733875453, "learning_rate": 9.697144936734242e-07, "loss": 8.028745651245117e-05, "reward": 0.6655833721160889, "reward_std": 0.2713155150413513, "rewards/DrugCombAccuracyCOTORM/mean": 0.5884895920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.38875919580459595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 372.3125, "completions/min_length": 301.0, "epoch": 4.0058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0833165645599365, "kl": 0.0043667504214681685, "learning_rate": 9.696704928758487e-07, "loss": 4.3364849261706695e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 461.5, "completions/min_length": 403.0, "epoch": 4.007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.025660181418061256, "kl": 0.006921412190422416, "learning_rate": 9.696264611375695e-07, "loss": 6.939849117770791e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 483.25, "completions/min_length": 413.0, "epoch": 4.008823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.00862562283873558, "kl": 0.005421363515779376, "learning_rate": 9.695823984614876e-07, "loss": 5.4285512305796146e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 467.1875, "completions/min_length": 398.0, "epoch": 4.010294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012566271238029003, "kl": 0.005777093698270619, "learning_rate": 9.695383048505054e-07, "loss": 5.7801247749011964e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 452.875, "completions/min_length": 408.0, "epoch": 4.011764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007222205866128206, "kl": 0.003985368355643004, "learning_rate": 9.694941803075283e-07, "loss": 4.001462002634071e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 401.3125, "completions/min_length": 365.0, "epoch": 4.0132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.027093637734651566, "kl": 0.0060485865687951446, "learning_rate": 9.694500248354624e-07, "loss": 5.967647666693665e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 438.9375, "completions/min_length": 382.0, "epoch": 4.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007371609564870596, "kl": 0.0045313797309063375, "learning_rate": 9.694058384372172e-07, "loss": 4.526649718172848e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 450.125, "completions/min_length": 372.0, "epoch": 4.016176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0753051042556763, "kl": 0.006116488832049072, "learning_rate": 9.69361621115703e-07, "loss": 6.0573729570023715e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 434.8125, "completions/min_length": 367.0, "epoch": 4.017647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9479577541351318, "kl": 0.005064167664386332, "learning_rate": 9.693173728738334e-07, "loss": 5.028887608204968e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 429.375, "completions/min_length": 393.0, "epoch": 4.019117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714517951011658, "kl": 0.004174269910436124, "learning_rate": 9.692730937145226e-07, "loss": 4.166199141764082e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 475.625, "completions/min_length": 416.0, "epoch": 4.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0431976318359375, "kl": 0.004824814153835177, "learning_rate": 9.69228783640688e-07, "loss": 4.827231168746948e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 432.125, "completions/min_length": 391.0, "epoch": 4.022058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009168645367026329, "kl": 0.004882778972387314, "learning_rate": 9.691844426552487e-07, "loss": 4.9116177251562476e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 454.0625, "completions/min_length": 396.0, "epoch": 4.023529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8409298062324524, "kl": 0.005372987710870802, "learning_rate": 9.691400707611256e-07, "loss": 5.404531111707911e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 402.1875, "completions/min_length": 361.0, "epoch": 4.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.01163613423705101, "kl": 0.005069133476354182, "learning_rate": 9.69095667961242e-07, "loss": 5.076222441857681e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 449.1875, "completions/min_length": 384.0, "epoch": 4.026470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9610134363174438, "kl": 0.008385724853724241, "learning_rate": 9.690512342585228e-07, "loss": 8.504141442244872e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 447.3125, "completions/min_length": 378.0, "epoch": 4.027941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8614036440849304, "kl": 0.007237027515657246, "learning_rate": 9.690067696558954e-07, "loss": 7.232793723233044e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 490.5, "completions/min_length": 436.0, "epoch": 4.029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011357373557984829, "kl": 0.00662202516105026, "learning_rate": 9.689622741562891e-07, "loss": 6.60801088088192e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 436.9375, "completions/min_length": 398.0, "epoch": 4.030882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01204483862966299, "kl": 0.006204410106875002, "learning_rate": 9.689177477626346e-07, "loss": 6.16456163697876e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 434.4375, "completions/min_length": 373.0, "epoch": 4.0323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9572342038154602, "kl": 0.00435026059858501, "learning_rate": 9.688731904778658e-07, "loss": 4.329904913902283e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 415.625, "completions/min_length": 342.0, "epoch": 4.033823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019999714568257332, "kl": 0.005132106598466635, "learning_rate": 9.688286023049176e-07, "loss": 5.177484126761556e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 418.375, "completions/min_length": 394.0, "epoch": 4.035294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.024605918675661087, "kl": 0.00476845377124846, "learning_rate": 9.687839832467277e-07, "loss": 4.7601573896827176e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 502.5, "completions/min_length": 416.0, "epoch": 4.036764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.074770212173462, "kl": 0.006214754423126578, "learning_rate": 9.687393333062354e-07, "loss": 6.183981895446777e-05, "reward": 0.831166684627533, "reward_std": 0.1770123839378357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8019791841506958, "rewards/DrugCombAccuracyCOTORM/std": 0.23771226406097412, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.13437096774578094, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 403.25, "completions/min_length": 335.0, "epoch": 4.038235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9154734015464783, "kl": 0.007145972573198378, "learning_rate": 9.68694652486382e-07, "loss": 7.184395508375019e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 473.625, "completions/min_length": 413.0, "epoch": 4.0397058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.3201861381530762, "kl": 0.005801078397780657, "learning_rate": 9.686499407901109e-07, "loss": 5.836784839630127e-05, "reward": 0.824999988079071, "reward_std": 0.37287637591362, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 463.25, "completions/min_length": 394.0, "epoch": 4.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1646860837936401, "kl": 0.006168059189803898, "learning_rate": 9.686051982203678e-07, "loss": 6.130158726591617e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 514.75, "completions/min_length": 425.0, "epoch": 4.04264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2365734577178955, "kl": 0.008125663036480546, "learning_rate": 9.685604247801001e-07, "loss": 8.030980825424194e-05, "reward": 0.49896955490112305, "reward_std": 0.23683370649814606, "rewards/DrugCombAccuracyCOTORM/mean": 0.4143369495868683, "rewards/DrugCombAccuracyCOTORM/std": 0.4729776382446289, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.675000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.5026596188545227, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 446.5, "completions/min_length": 326.0, "epoch": 4.044117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.6130571365356445, "kl": 0.006386509980075061, "learning_rate": 9.685156204722573e-07, "loss": 6.349384784698486e-05, "reward": 0.6353750228881836, "reward_std": 0.18750114738941193, "rewards/DrugCombAccuracyCOTORM/mean": 0.5715625286102295, "rewards/DrugCombAccuracyCOTORM/std": 0.3958818018436432, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.4857935607433319, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 418.3125, "completions/min_length": 377.0, "epoch": 4.045588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.073852300643921, "kl": 0.008365441113710403, "learning_rate": 9.684707852997911e-07, "loss": 8.500856347382069e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 480.625, "completions/min_length": 440.0, "epoch": 4.047058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1443113088607788, "kl": 0.005845957435667515, "learning_rate": 9.684259192656552e-07, "loss": 5.83454966545105e-05, "reward": 0.5062500238418579, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 504.0625, "completions/min_length": 401.0, "epoch": 4.048529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.026341199874878, "kl": 0.00727723038289696, "learning_rate": 9.683810223728053e-07, "loss": 7.198207458714023e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 485.9375, "completions/min_length": 405.0, "epoch": 4.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.011788061819970608, "kl": 0.004550375044345856, "learning_rate": 9.683360946241987e-07, "loss": 4.567948781186715e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 446.6875, "completions/min_length": 382.0, "epoch": 4.051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1915851831436157, "kl": 0.007668568403460085, "learning_rate": 9.682911360227957e-07, "loss": 7.648937753401697e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 449.4375, "completions/min_length": 384.0, "epoch": 4.052941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9064172506332397, "kl": 0.008196127251721919, "learning_rate": 9.682461465715575e-07, "loss": 8.201367745641619e-05, "reward": 0.9270833134651184, "reward_std": 0.1593482345342636, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 444.0, "completions/min_length": 364.0, "epoch": 4.054411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1137851476669312, "kl": 0.0067316265776753426, "learning_rate": 9.682011262734482e-07, "loss": 6.763034616596997e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 446.625, "completions/min_length": 372.0, "epoch": 4.055882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8390690684318542, "kl": 0.004884555586613715, "learning_rate": 9.681560751314338e-07, "loss": 4.871671990258619e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 445.5625, "completions/min_length": 403.0, "epoch": 4.057352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9043441414833069, "kl": 0.005892504355870187, "learning_rate": 9.681109931484818e-07, "loss": 5.885213613510132e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 466.75, "completions/min_length": 398.0, "epoch": 4.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0669611692428589, "kl": 0.005455899750813842, "learning_rate": 9.680658803275618e-07, "loss": 5.439714004751295e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 488.125, "completions/min_length": 401.0, "epoch": 4.060294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9524519443511963, "kl": 0.004389244713820517, "learning_rate": 9.680207366716467e-07, "loss": 4.3805302993860096e-05, "reward": 0.7441666722297668, "reward_std": 0.1262820065021515, "rewards/DrugCombAccuracyCOTORM/mean": 0.690625011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.38549479842185974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 425.9375, "completions/min_length": 384.0, "epoch": 4.061764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009319848380982876, "kl": 0.004819761379621923, "learning_rate": 9.679755621837095e-07, "loss": 4.8337067710235715e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 398.3125, "completions/min_length": 363.0, "epoch": 4.063235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02988496981561184, "kl": 0.006041855784133077, "learning_rate": 9.679303568667268e-07, "loss": 6.054359255358577e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 447.375, "completions/min_length": 395.0, "epoch": 4.064705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9760624170303345, "kl": 0.005529803456738591, "learning_rate": 9.678851207236763e-07, "loss": 5.519425394595601e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 454.625, "completions/min_length": 373.0, "epoch": 4.0661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.053713083267212, "kl": 0.005220880615524948, "learning_rate": 9.67839853757538e-07, "loss": 5.196070560486987e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 488.0, "completions/min_length": 446.0, "epoch": 4.067647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.6942100524902344, "kl": 0.008587953285314143, "learning_rate": 9.677945559712943e-07, "loss": 8.579716086387634e-05, "reward": 0.5057291984558105, "reward_std": 0.3988705575466156, "rewards/DrugCombAccuracyCOTORM/mean": 0.4270833432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.4633563458919525, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.539096474647522, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 435.0625, "completions/min_length": 349.0, "epoch": 4.069117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2563518285751343, "kl": 0.006568029930349439, "learning_rate": 9.67749227367929e-07, "loss": 6.700257654301822e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 476.0, "completions/min_length": 403.0, "epoch": 4.070588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0011042356491089, "kl": 0.005775471683591604, "learning_rate": 9.677038679504284e-07, "loss": 5.683696872438304e-05, "reward": 0.30000001192092896, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 466.4375, "completions/min_length": 423.0, "epoch": 4.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013024133630096912, "kl": 0.006032277829945087, "learning_rate": 9.676584777217804e-07, "loss": 6.011742880218662e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 423.1875, "completions/min_length": 360.0, "epoch": 4.073529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.5002652406692505, "kl": 0.004802762239705771, "learning_rate": 9.676130566849754e-07, "loss": 4.8547983169555664e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 429.875, "completions/min_length": 386.0, "epoch": 4.075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01700531505048275, "kl": 0.006002030451782048, "learning_rate": 9.675676048430058e-07, "loss": 5.982630682410672e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 496.8125, "completions/min_length": 422.0, "epoch": 4.076470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8754711747169495, "kl": 0.00462940102443099, "learning_rate": 9.675221221988656e-07, "loss": 4.570186138153076e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 479.5625, "completions/min_length": 391.0, "epoch": 4.077941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.686281442642212, "kl": 0.01472083164844662, "learning_rate": 9.67476608755551e-07, "loss": 0.00014888495206832886, "reward": 0.6625000238418579, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 480.0, "completions/min_length": 375.0, "epoch": 4.079411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9717657566070557, "kl": 0.004956976335961372, "learning_rate": 9.674310645160606e-07, "loss": 4.979223012924194e-05, "reward": 0.622083306312561, "reward_std": 0.11682217568159103, "rewards/DrugCombAccuracyCOTORM/mean": 0.5874999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.45734742283821106, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.6663193702697754, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 490.4375, "completions/min_length": 433.0, "epoch": 4.080882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6873286366462708, "kl": 0.004921363666653633, "learning_rate": 9.673854894833945e-07, "loss": 4.859975160798058e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 506.125, "completions/min_length": 459.0, "epoch": 4.08235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008232345804572105, "kl": 0.0049688550643622875, "learning_rate": 9.673398836605552e-07, "loss": 4.955066106049344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 442.5625, "completions/min_length": 386.0, "epoch": 4.083823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9405525922775269, "kl": 0.00611609173938632, "learning_rate": 9.672942470505471e-07, "loss": 6.124749779701233e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 469.875, "completions/min_length": 398.0, "epoch": 4.0852941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.01262239646166563, "kl": 0.006094192969612777, "learning_rate": 9.672485796563766e-07, "loss": 6.0551763453986496e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 460.3125, "completions/min_length": 421.0, "epoch": 4.086764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0611252784729004, "kl": 0.005277901014778763, "learning_rate": 9.67202881481052e-07, "loss": 5.305558443069458e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 454.8125, "completions/min_length": 369.0, "epoch": 4.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.021264348179101944, "kl": 0.0063724780338816345, "learning_rate": 9.67157152527584e-07, "loss": 6.335604848572984e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 398.25, "completions/min_length": 317.0, "epoch": 4.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010998783633112907, "kl": 0.005951371509581804, "learning_rate": 9.67111392798985e-07, "loss": 5.974427403998561e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 418.375, "completions/min_length": 373.0, "epoch": 4.091176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.030534910038113594, "kl": 0.007344331592321396, "learning_rate": 9.670656022982695e-07, "loss": 7.269077468663454e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 486.625, "completions/min_length": 377.0, "epoch": 4.0926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8047130703926086, "kl": 0.005100007285363972, "learning_rate": 9.670197810284543e-07, "loss": 5.161909939488396e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 400.0, "completions/min_length": 334.0, "epoch": 4.094117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.008882519789040089, "kl": 0.0044948175782337785, "learning_rate": 9.669739289925575e-07, "loss": 4.501121293287724e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 402.375, "completions/min_length": 371.0, "epoch": 4.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074766105972230434, "kl": 0.0043382185976952314, "learning_rate": 9.669280461936003e-07, "loss": 4.348670699982904e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 480.375, "completions/min_length": 433.0, "epoch": 4.097058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.1810581684112549, "kl": 0.0065026802476495504, "learning_rate": 9.66882132634605e-07, "loss": 6.469894287874922e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 414.4375, "completions/min_length": 382.0, "epoch": 4.098529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2439168691635132, "kl": 0.006861429428681731, "learning_rate": 9.668361883185962e-07, "loss": 6.856769323348999e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 438.375, "completions/min_length": 390.0, "epoch": 4.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.3571230471134186, "kl": 0.014257078059017658, "learning_rate": 9.667902132486008e-07, "loss": 0.00013809352822136134, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 467.0, "completions/min_length": 419.0, "epoch": 4.101470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011236472986638546, "kl": 0.005890143685974181, "learning_rate": 9.667442074276474e-07, "loss": 5.912251799600199e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 489.4375, "completions/min_length": 441.0, "epoch": 4.102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.027410168200731277, "kl": 0.0069440657971426845, "learning_rate": 9.66698170858767e-07, "loss": 6.967183435335755e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 446.6875, "completions/min_length": 385.0, "epoch": 4.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9768794178962708, "kl": 0.005505254142917693, "learning_rate": 9.66652103544992e-07, "loss": 5.503743886947632e-05, "reward": 0.831250011920929, "reward_std": 0.2344255894422531, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 459.0625, "completions/min_length": 413.0, "epoch": 4.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1945905685424805, "kl": 0.0073147879447788, "learning_rate": 9.666060054893575e-07, "loss": 7.297040428966284e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 530.125, "completions/min_length": 473.0, "epoch": 4.107352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9955691695213318, "kl": 0.004916193604003638, "learning_rate": 9.665598766949e-07, "loss": 4.9381931603420526e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 443.75, "completions/min_length": 364.0, "epoch": 4.108823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014834295958280563, "kl": 0.0062218771781772375, "learning_rate": 9.665137171646586e-07, "loss": 6.265746924327686e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 418.5, "completions/min_length": 378.0, "epoch": 4.110294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.445237398147583, "kl": 0.006009680451825261, "learning_rate": 9.664675269016741e-07, "loss": 5.9582293033599854e-05, "reward": 0.7749999761581421, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 399.25, "completions/min_length": 360.0, "epoch": 4.1117647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.01719318889081478, "kl": 0.004485273268073797, "learning_rate": 9.664213059089895e-07, "loss": 4.493497544899583e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 422.9375, "completions/min_length": 377.0, "epoch": 4.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01624296046793461, "kl": 0.006655328324995935, "learning_rate": 9.663750541896495e-07, "loss": 6.647893314948305e-05, "reward": 0.27133333683013916, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.10999999940395355, "rewards/DrugCombAccuracyCOTORM/std": 0.1136075109243393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 498.0625, "completions/min_length": 385.0, "epoch": 4.114705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01494544092565775, "kl": 0.00481483933981508, "learning_rate": 9.663287717467015e-07, "loss": 4.792376785189845e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 500.25, "completions/min_length": 448.0, "epoch": 4.116176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.007989400997757912, "kl": 0.00509891624096781, "learning_rate": 9.662824585831937e-07, "loss": 5.116546162753366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 457.25, "completions/min_length": 374.0, "epoch": 4.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9156997203826904, "kl": 0.005145463976077735, "learning_rate": 9.66236114702178e-07, "loss": 5.132704973220825e-05, "reward": 0.9877333641052246, "reward_std": 0.03469536080956459, "rewards/DrugCombAccuracyCOTORM/mean": 0.9846667051315308, "rewards/DrugCombAccuracyCOTORM/std": 0.06133333221077919, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 466.9375, "completions/min_length": 395.0, "epoch": 4.1191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9501500129699707, "kl": 0.005709251505322754, "learning_rate": 9.661897401067065e-07, "loss": 5.6512653827667236e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 426.3125, "completions/min_length": 394.0, "epoch": 4.120588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010603982023894787, "kl": 0.005462553235702217, "learning_rate": 9.66143334799835e-07, "loss": 5.4708219977328554e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 511.8125, "completions/min_length": 463.0, "epoch": 4.122058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5225931406021118, "kl": 0.007646364625543356, "learning_rate": 9.6609689878462e-07, "loss": 7.691234350204468e-05, "reward": 0.36250001192092896, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 479.5, "completions/min_length": 419.0, "epoch": 4.123529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.5289967060089111, "kl": 0.008113034185953438, "learning_rate": 9.660504320641212e-07, "loss": 8.056312799453735e-05, "reward": 0.5062500238418579, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 481.5625, "completions/min_length": 447.0, "epoch": 4.125, "frac_reward_zero_std": 0.0, "grad_norm": 1.2425222396850586, "kl": 0.005518460879102349, "learning_rate": 9.660039346413992e-07, "loss": 5.492940545082092e-05, "reward": 0.800000011920929, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 416.0, "completions/min_length": 324.0, "epoch": 4.126470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.3036980628967285, "kl": 0.0057296184822916985, "learning_rate": 9.659574065195173e-07, "loss": 5.6896358728408813e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 470.5625, "completions/min_length": 423.0, "epoch": 4.127941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2865251302719116, "kl": 0.006280937697738409, "learning_rate": 9.659108477015406e-07, "loss": 6.277859210968018e-05, "reward": 0.6375000476837158, "reward_std": 0.1767767071723938, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 450.8125, "completions/min_length": 411.0, "epoch": 4.129411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1500014066696167, "kl": 0.006128602719400078, "learning_rate": 9.658642581905363e-07, "loss": 6.189942359924316e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 427.25, "completions/min_length": 377.0, "epoch": 4.1308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.011561227962374687, "kl": 0.005926107172854245, "learning_rate": 9.658176379895738e-07, "loss": 5.840164521941915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 468.0625, "completions/min_length": 410.0, "epoch": 4.132352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0873069763183594, "kl": 0.00608060194645077, "learning_rate": 9.65770987101724e-07, "loss": 6.0768645198550075e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 438.6875, "completions/min_length": 391.0, "epoch": 4.133823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.007384978700429201, "kl": 0.004524405347183347, "learning_rate": 9.657243055300604e-07, "loss": 4.533831088338047e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 481.5625, "completions/min_length": 431.0, "epoch": 4.135294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4882899522781372, "kl": 0.006397248129360378, "learning_rate": 9.65677593277658e-07, "loss": 6.4067542552948e-05, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 456.875, "completions/min_length": 399.0, "epoch": 4.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.3452001810073853, "kl": 0.008527740486897528, "learning_rate": 9.656308503475944e-07, "loss": 8.550286293029785e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 492.5625, "completions/min_length": 402.0, "epoch": 4.1382352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.8293486833572388, "kl": 0.006247125333175063, "learning_rate": 9.655840767429486e-07, "loss": 6.266683340072632e-05, "reward": 0.5975833535194397, "reward_std": 0.07148842513561249, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371395587921, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6607375144958496, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 432.375, "completions/min_length": 369.0, "epoch": 4.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015080660581588745, "kl": 0.006750181666575372, "learning_rate": 9.655372724668022e-07, "loss": 6.758898234693334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 431.4375, "completions/min_length": 369.0, "epoch": 4.141176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012014524079859257, "kl": 0.0063891971949487925, "learning_rate": 9.654904375222384e-07, "loss": 6.384063453879207e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 467.0, "completions/min_length": 413.0, "epoch": 4.142647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.10608867555856705, "kl": 0.007409931917209178, "learning_rate": 9.654435719123424e-07, "loss": 7.507902046199888e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 471.0625, "completions/min_length": 422.0, "epoch": 4.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01486627571284771, "kl": 0.006147461012005806, "learning_rate": 9.65396675640202e-07, "loss": 6.232090527191758e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/mean_length": 509.8125, "completions/min_length": 436.0, "epoch": 4.145588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.324700951576233, "kl": 0.006486388971097767, "learning_rate": 9.653497487089064e-07, "loss": 6.4849853515625e-05, "reward": 0.8957916498184204, "reward_std": 0.22740399837493896, "rewards/DrugCombAccuracyCOTORM/mean": 0.8814583420753479, "rewards/DrugCombAccuracyCOTORM/std": 0.26528245210647583, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 498.1875, "completions/min_length": 436.0, "epoch": 4.147058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8901631832122803, "kl": 0.005781431798823178, "learning_rate": 9.653027911215469e-07, "loss": 5.7352986914338544e-05, "reward": 0.5979166626930237, "reward_std": 0.005892557092010975, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 468.5625, "completions/min_length": 419.0, "epoch": 4.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0450377464294434, "kl": 0.004943700623698533, "learning_rate": 9.65255802881217e-07, "loss": 5.018274168833159e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 421.25, "completions/min_length": 359.0, "epoch": 4.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.011892776004970074, "kl": 0.005395705113187432, "learning_rate": 9.652087839910123e-07, "loss": 5.411968595581129e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 490.375, "completions/min_length": 421.0, "epoch": 4.151470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8493247628211975, "kl": 0.005448102136142552, "learning_rate": 9.651617344540302e-07, "loss": 5.433708429336548e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 462.75, "completions/min_length": 399.0, "epoch": 4.152941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.448588490486145, "kl": 0.007566654065158218, "learning_rate": 9.651146542733702e-07, "loss": 7.555261254310608e-05, "reward": 0.7127083539962769, "reward_std": 0.31249645352363586, "rewards/DrugCombAccuracyCOTORM/mean": 0.6421874761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.4814664125442505, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 477.125, "completions/min_length": 413.0, "epoch": 4.154411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.582321286201477, "kl": 0.006853318423964083, "learning_rate": 9.650675434521339e-07, "loss": 6.918609142303467e-05, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 410.625, "completions/min_length": 360.0, "epoch": 4.155882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0208969097584486, "kl": 0.007726251264102757, "learning_rate": 9.650204019934245e-07, "loss": 7.663283031433821e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 455.125, "completions/min_length": 380.0, "epoch": 4.1573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0782501697540283, "kl": 0.005480501218698919, "learning_rate": 9.649732299003482e-07, "loss": 5.5070966482162476e-05, "reward": 0.84375, "reward_std": 0.21619683504104614, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 454.25, "completions/min_length": 410.0, "epoch": 4.158823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.024915099143982, "kl": 0.0063225001795217395, "learning_rate": 9.649260271760121e-07, "loss": 6.246638076845556e-05, "reward": 0.7437499761581421, "reward_std": 0.216884046792984, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 439.5625, "completions/min_length": 386.0, "epoch": 4.160294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9776560068130493, "kl": 0.004569413140416145, "learning_rate": 9.648787938235256e-07, "loss": 4.586105205817148e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 423.0625, "completions/min_length": 374.0, "epoch": 4.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0937483310699463, "kl": 0.0089611797593534, "learning_rate": 9.64831529846001e-07, "loss": 9.069964289665222e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 429.75, "completions/min_length": 375.0, "epoch": 4.163235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.030794698745012283, "kl": 0.006498795002698898, "learning_rate": 9.647842352465515e-07, "loss": 6.436332478187978e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 467.875, "completions/min_length": 362.0, "epoch": 4.1647058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.7658097743988037, "kl": 0.005718638596590608, "learning_rate": 9.647369100282927e-07, "loss": 5.740998312830925e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 493.6875, "completions/min_length": 393.0, "epoch": 4.166176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9462056756019592, "kl": 0.005605318350717425, "learning_rate": 9.646895541943425e-07, "loss": 5.5639477068325505e-05, "reward": 0.5625, "reward_std": 0.051754921674728394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 438.5, "completions/min_length": 388.0, "epoch": 4.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009713741019368172, "kl": 0.005569864879362285, "learning_rate": 9.646421677478204e-07, "loss": 5.534344745683484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 432.6875, "completions/min_length": 405.0, "epoch": 4.169117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01563239097595215, "kl": 0.00569123012246564, "learning_rate": 9.64594750691848e-07, "loss": 5.6853652495192364e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 493.6875, "completions/min_length": 416.0, "epoch": 4.170588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3802474737167358, "kl": 0.004669079033192247, "learning_rate": 9.645473030295494e-07, "loss": 4.717707633972168e-05, "reward": 0.7964791655540466, "reward_std": 0.2865186929702759, "rewards/DrugCombAccuracyCOTORM/mean": 0.7579687833786011, "rewards/DrugCombAccuracyCOTORM/std": 0.37081801891326904, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9010416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.15280933678150177, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 449.5, "completions/min_length": 406.0, "epoch": 4.172058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0585250854492188, "kl": 0.0060223296750336885, "learning_rate": 9.644998247640498e-07, "loss": 6.0304999351501465e-05, "reward": 0.5011666417121887, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833701252937317, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 410.0625, "completions/min_length": 355.0, "epoch": 4.173529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008688547648489475, "kl": 0.004925309563986957, "learning_rate": 9.644523158984777e-07, "loss": 4.956770862918347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 484.75, "completions/min_length": 408.0, "epoch": 4.175, "frac_reward_zero_std": 0.5, "grad_norm": 0.8797678351402283, "kl": 0.005138500942848623, "learning_rate": 9.644047764359621e-07, "loss": 5.1422190153971314e-05, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 424.375, "completions/min_length": 380.0, "epoch": 4.176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.024300381541252136, "kl": 0.005572411697357893, "learning_rate": 9.64357206379635e-07, "loss": 5.592286106548272e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 493.125, "completions/min_length": 428.0, "epoch": 4.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9030099511146545, "kl": 0.006296185893006623, "learning_rate": 9.643096057326306e-07, "loss": 6.301559187704697e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 463.125, "completions/min_length": 405.0, "epoch": 4.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9870181083679199, "kl": 0.005635101115331054, "learning_rate": 9.642619744980841e-07, "loss": 5.636002606479451e-05, "reward": 0.893750011920929, "reward_std": 0.1971900761127472, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 429.1875, "completions/min_length": 399.0, "epoch": 4.180882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01346895843744278, "kl": 0.005806806730106473, "learning_rate": 9.642143126791337e-07, "loss": 5.829639485455118e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 410.0, "completions/min_length": 301.0, "epoch": 4.182352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011131075210869312, "kl": 0.005567661370150745, "learning_rate": 9.641666202789193e-07, "loss": 5.596222763415426e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 467.3125, "completions/min_length": 396.0, "epoch": 4.1838235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058278427459299564, "kl": 0.00350369606167078, "learning_rate": 9.641188973005824e-07, "loss": 3.5138451494276524e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 461.0625, "completions/min_length": 422.0, "epoch": 4.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0099711418151855, "kl": 0.006043624482117593, "learning_rate": 9.640711437472673e-07, "loss": 5.9965343098156154e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 452.8125, "completions/min_length": 407.0, "epoch": 4.186764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1244860887527466, "kl": 0.007596956565976143, "learning_rate": 9.640233596221196e-07, "loss": 7.556378841400146e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 438.25, "completions/min_length": 352.0, "epoch": 4.188235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3774821758270264, "kl": 0.007064815727062523, "learning_rate": 9.639755449282874e-07, "loss": 7.076188921928406e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 418.1875, "completions/min_length": 382.0, "epoch": 4.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011645013466477394, "kl": 0.00598133762832731, "learning_rate": 9.639276996689201e-07, "loss": 5.950389459030703e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 446.125, "completions/min_length": 393.0, "epoch": 4.1911764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011300161480903625, "kl": 0.005573649192228913, "learning_rate": 9.638798238471703e-07, "loss": 5.557433178182691e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 383.5, "completions/min_length": 295.0, "epoch": 4.192647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.006008327938616276, "kl": 0.004013178404420614, "learning_rate": 9.638319174661915e-07, "loss": 3.9844864659244195e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 450.4375, "completions/min_length": 416.0, "epoch": 4.194117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010361925698816776, "kl": 0.0054965418530628085, "learning_rate": 9.6378398052914e-07, "loss": 5.4907588491914794e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 468.0625, "completions/min_length": 407.0, "epoch": 4.195588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8289005756378174, "kl": 0.00576980080222711, "learning_rate": 9.637360130391732e-07, "loss": 5.748084731749259e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 422.1875, "completions/min_length": 395.0, "epoch": 4.197058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1803202629089355, "kl": 0.008120082085952163, "learning_rate": 9.636880149994518e-07, "loss": 8.150145004037768e-05, "reward": 0.7749999761581421, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 423.5625, "completions/min_length": 374.0, "epoch": 4.198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014403235167264938, "kl": 0.005862277641426772, "learning_rate": 9.636399864131373e-07, "loss": 5.816008342662826e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 459.5625, "completions/min_length": 399.0, "epoch": 4.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.07896938920021057, "kl": 0.005036062095314264, "learning_rate": 9.635919272833937e-07, "loss": 5.098168185213581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 467.9375, "completions/min_length": 376.0, "epoch": 4.201470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2726341485977173, "kl": 0.0056342974421568215, "learning_rate": 9.63543837613387e-07, "loss": 5.556643009185791e-05, "reward": 0.8296874761581421, "reward_std": 0.37182796001434326, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 424.25, "completions/min_length": 371.0, "epoch": 4.202941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.006596730090677738, "kl": 0.0040139080374501646, "learning_rate": 9.634957174062857e-07, "loss": 3.995843144366518e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 405.3125, "completions/min_length": 343.0, "epoch": 4.204411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018638450652360916, "kl": 0.006281706679146737, "learning_rate": 9.634475666652594e-07, "loss": 6.273282633628696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 474.4375, "completions/min_length": 398.0, "epoch": 4.205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9086681604385376, "kl": 0.006424319464713335, "learning_rate": 9.633993853934802e-07, "loss": 6.418675184249878e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 451.875, "completions/min_length": 406.0, "epoch": 4.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010702282190322876, "kl": 0.005647451616823673, "learning_rate": 9.633511735941222e-07, "loss": 5.686421718564816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 405.6875, "completions/min_length": 351.0, "epoch": 4.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.038706012070178986, "kl": 0.006403245963156223, "learning_rate": 9.633029312703616e-07, "loss": 6.442424637498334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 472.0, "completions/min_length": 371.0, "epoch": 4.2102941176470585, "frac_reward_zero_std": 0.0, "grad_norm": 1.3883483409881592, "kl": 0.00615171215031296, "learning_rate": 9.632546584253761e-07, "loss": 6.199255585670471e-05, "reward": 0.7244791388511658, "reward_std": 0.32573202252388, "rewards/DrugCombAccuracyCOTORM/mean": 0.6770833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4323439300060272, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.3010398745536804, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 516.875, "completions/min_length": 446.0, "epoch": 4.211764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4672225713729858, "kl": 0.006942882435396314, "learning_rate": 9.632063550623463e-07, "loss": 6.929785013198853e-05, "reward": 0.5445833206176758, "reward_std": 0.37712085247039795, "rewards/DrugCombAccuracyCOTORM/mean": 0.4437500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4539732038974762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 472.9375, "completions/min_length": 400.0, "epoch": 4.213235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1351287364959717, "kl": 0.006766416714526713, "learning_rate": 9.63158021184454e-07, "loss": 6.685906555503607e-05, "reward": 0.6833333373069763, "reward_std": 0.1583646684885025, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 461.625, "completions/min_length": 393.0, "epoch": 4.214705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9406794905662537, "kl": 0.015543107991106808, "learning_rate": 9.631096567948833e-07, "loss": 0.00016342848539352417, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 495.25, "completions/min_length": 367.0, "epoch": 4.216176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0160679817199707, "kl": 0.006751290871761739, "learning_rate": 9.630612618968204e-07, "loss": 6.757676601409912e-05, "reward": 0.7000000476837158, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 506.6875, "completions/min_length": 475.0, "epoch": 4.2176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0471761226654053, "kl": 0.004821474431082606, "learning_rate": 9.630128364934536e-07, "loss": 4.8170921218115836e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 503.75, "completions/min_length": 445.0, "epoch": 4.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9527686834335327, "kl": 0.006124028470367193, "learning_rate": 9.629643805879727e-07, "loss": 6.093457341194153e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 447.5, "completions/min_length": 389.0, "epoch": 4.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9620295166969299, "kl": 0.00608469033613801, "learning_rate": 9.6291589418357e-07, "loss": 6.079276499804109e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 443.75, "completions/min_length": 385.0, "epoch": 4.222058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.3505637645721436, "kl": 0.005613082670606673, "learning_rate": 9.628673772834397e-07, "loss": 5.608223000308499e-05, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 461.625, "completions/min_length": 390.0, "epoch": 4.223529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0549646615982056, "kl": 0.005860382923856378, "learning_rate": 9.62818829890778e-07, "loss": 5.914692519581877e-05, "reward": 0.8558710813522339, "reward_std": 0.07520861178636551, "rewards/DrugCombAccuracyCOTORM/mean": 0.8341617584228516, "rewards/DrugCombAccuracyCOTORM/std": 0.21084456145763397, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.19924628734588623, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 405.375, "completions/min_length": 355.0, "epoch": 4.225, "frac_reward_zero_std": 0.5, "grad_norm": 1.1806690692901611, "kl": 0.007888188934884965, "learning_rate": 9.62770252008783e-07, "loss": 7.859617471694946e-05, "reward": 0.7734375, "reward_std": 0.2422400414943695, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 443.8125, "completions/min_length": 392.0, "epoch": 4.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008840844966471195, "kl": 0.005535171250812709, "learning_rate": 9.62721643640655e-07, "loss": 5.538772529689595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 399.9375, "completions/min_length": 373.0, "epoch": 4.227941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.4005746841430664, "kl": 0.005364192766137421, "learning_rate": 9.62673004789596e-07, "loss": 5.346859688870609e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 503.0625, "completions/min_length": 445.0, "epoch": 4.229411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0438461303710938, "kl": 0.006033318059053272, "learning_rate": 9.626243354588106e-07, "loss": 5.965656600892544e-05, "reward": 0.7009999752044678, "reward_std": 0.18456745147705078, "rewards/DrugCombAccuracyCOTORM/mean": 0.6301562786102295, "rewards/DrugCombAccuracyCOTORM/std": 0.4935082495212555, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 508.5625, "completions/min_length": 422.0, "epoch": 4.230882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.6338176727294922, "kl": 0.00854922051075846, "learning_rate": 9.625756356515044e-07, "loss": 8.657574653625488e-05, "reward": 0.6099796891212463, "reward_std": 0.2977760434150696, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416412949562073, "rewards/DrugCombAccuracyCOTORM/std": 0.4774332344532013, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7666666507720947, "rewards/DrugCombCoverageCOTORM/std": 0.3098386824131012, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 477.3125, "completions/min_length": 423.0, "epoch": 4.232352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7858670353889465, "kl": 0.006468962179496884, "learning_rate": 9.625269053708863e-07, "loss": 6.46534317638725e-05, "reward": 0.831250011920929, "reward_std": 0.10415475070476532, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 477.8125, "completions/min_length": 412.0, "epoch": 4.233823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.012714269571006298, "kl": 0.005287801148369908, "learning_rate": 9.62478144620166e-07, "loss": 5.275792864267714e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 515.5, "completions/min_length": 427.0, "epoch": 4.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.919755756855011, "kl": 0.0060714936116710305, "learning_rate": 9.624293534025557e-07, "loss": 6.098585436120629e-05, "reward": 0.6240358352661133, "reward_std": 0.11242285370826721, "rewards/DrugCombAccuracyCOTORM/mean": 0.5745326280593872, "rewards/DrugCombAccuracyCOTORM/std": 0.46899300813674927, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6597222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.5385833978652954, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 477.6875, "completions/min_length": 400.0, "epoch": 4.2367647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.1233395338058472, "kl": 0.008746253908611834, "learning_rate": 9.623805317212702e-07, "loss": 8.54695972520858e-05, "reward": 0.7875000238418579, "reward_std": 0.14900465309619904, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3333333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/mean_length": 569.875, "completions/min_length": 416.0, "epoch": 4.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.968000054359436, "kl": 0.008292759419418871, "learning_rate": 9.623316795795252e-07, "loss": 8.2431361079216e-05, "reward": 0.7171379327774048, "reward_std": 0.07588480412960052, "rewards/DrugCombAccuracyCOTORM/mean": 0.6611359119415283, "rewards/DrugCombAccuracyCOTORM/std": 0.37911808490753174, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8822916746139526, "rewards/DrugCombCoverageCOTORM/std": 0.2818897068500519, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 434.25, "completions/min_length": 350.0, "epoch": 4.239705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9545942544937134, "kl": 0.005490898503921926, "learning_rate": 9.622827969805393e-07, "loss": 5.421130117611028e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 430.6875, "completions/min_length": 376.0, "epoch": 4.241176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2430814504623413, "kl": 0.0063164676539599895, "learning_rate": 9.622338839275325e-07, "loss": 6.286175630521029e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 444.875, "completions/min_length": 357.0, "epoch": 4.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8301644921302795, "kl": 0.006324705434963107, "learning_rate": 9.621849404237273e-07, "loss": 6.34722673567012e-05, "reward": 0.675000011920929, "reward_std": 0.20528726279735565, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 476.75, "completions/min_length": 409.0, "epoch": 4.2441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.6400684118270874, "kl": 0.008838314679451287, "learning_rate": 9.621359664723478e-07, "loss": 8.874759078025818e-05, "reward": 0.7312500476837158, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 446.3125, "completions/min_length": 377.0, "epoch": 4.245588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0663633346557617, "kl": 0.006397988763637841, "learning_rate": 9.620869620766205e-07, "loss": 6.410107016563416e-05, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 465.1875, "completions/min_length": 428.0, "epoch": 4.247058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.008507403545081615, "kl": 0.005400300142355263, "learning_rate": 9.620379272397733e-07, "loss": 5.389192665461451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 453.875, "completions/min_length": 393.0, "epoch": 4.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0197051763534546, "kl": 0.006576590589247644, "learning_rate": 9.61988861965037e-07, "loss": 6.571848643943667e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 554.0625, "completions/min_length": 477.0, "epoch": 4.25, "frac_reward_zero_std": 0.5, "grad_norm": 1.1853965520858765, "kl": 0.00889200100209564, "learning_rate": 9.619397662556433e-07, "loss": 9.025174949783832e-05, "reward": 0.7645833492279053, "reward_std": 0.22261746227741241, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 484.3125, "completions/min_length": 413.0, "epoch": 4.251470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8947020173072815, "kl": 0.005316535825841129, "learning_rate": 9.61890640114827e-07, "loss": 5.3284180467016995e-05, "reward": 0.8295833468437195, "reward_std": 0.21131062507629395, "rewards/DrugCombAccuracyCOTORM/mean": 0.8208333253860474, "rewards/DrugCombAccuracyCOTORM/std": 0.3414294719696045, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 395.3125, "completions/min_length": 357.0, "epoch": 4.252941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007408069912344217, "kl": 0.004684810759499669, "learning_rate": 9.61841483545824e-07, "loss": 4.670735143008642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 424.5625, "completions/min_length": 348.0, "epoch": 4.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008670071139931679, "kl": 0.005002415680792183, "learning_rate": 9.617922965518731e-07, "loss": 5.0306000048294663e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 433.6875, "completions/min_length": 363.0, "epoch": 4.2558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7920899987220764, "kl": 0.004879792802967131, "learning_rate": 9.617430791362145e-07, "loss": 4.861503839492798e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 437.0625, "completions/min_length": 401.0, "epoch": 4.257352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7763972282409668, "kl": 0.00638080679345876, "learning_rate": 9.6169383130209e-07, "loss": 6.447626219596714e-05, "reward": 0.23750001192092896, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 447.1875, "completions/min_length": 370.0, "epoch": 4.258823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9791854619979858, "kl": 0.006071529584005475, "learning_rate": 9.616445530527447e-07, "loss": 6.0535967350006104e-05, "reward": 0.6327500343322754, "reward_std": 0.023334523662924767, "rewards/DrugCombAccuracyCOTORM/mean": 0.5721874833106995, "rewards/DrugCombAccuracyCOTORM/std": 0.44363635778427124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 463.5, "completions/min_length": 406.0, "epoch": 4.260294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0257331132888794, "kl": 0.007607856881804764, "learning_rate": 9.615952443914242e-07, "loss": 7.551908493041992e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 427.6875, "completions/min_length": 311.0, "epoch": 4.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0116318603977561, "kl": 0.005619498202577233, "learning_rate": 9.615459053213772e-07, "loss": 5.614530891762115e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 446.4375, "completions/min_length": 397.0, "epoch": 4.2632352941176475, "frac_reward_zero_std": 0.0, "grad_norm": 1.5656609535217285, "kl": 0.00628604379016906, "learning_rate": 9.614965358458541e-07, "loss": 6.319954991340637e-05, "reward": 0.4000000059604645, "reward_std": 0.34844106435775757, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 458.75, "completions/min_length": 426.0, "epoch": 4.264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015930011868476868, "kl": 0.007023853133432567, "learning_rate": 9.614471359681072e-07, "loss": 7.017661118879914e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 425.25, "completions/min_length": 374.0, "epoch": 4.266176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.007915489375591278, "kl": 0.005656519555486739, "learning_rate": 9.613977056913905e-07, "loss": 5.6815009884303436e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 447.875, "completions/min_length": 386.0, "epoch": 4.267647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9247335195541382, "kl": 0.00607708259485662, "learning_rate": 9.613482450189606e-07, "loss": 6.0861788369948044e-05, "reward": 0.925000011920929, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 415.25, "completions/min_length": 364.0, "epoch": 4.269117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010533133521676064, "kl": 0.0046649392461404204, "learning_rate": 9.61298753954076e-07, "loss": 4.6679961087647825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 489.375, "completions/min_length": 393.0, "epoch": 4.270588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.738235592842102, "kl": 0.0077277933014556766, "learning_rate": 9.612492324999969e-07, "loss": 7.674843072891235e-05, "reward": 0.7978000044822693, "reward_std": 0.3041299879550934, "rewards/DrugCombAccuracyCOTORM/mean": 0.7534999847412109, "rewards/DrugCombAccuracyCOTORM/std": 0.3824995160102844, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.949999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.08944271504878998, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 442.875, "completions/min_length": 405.0, "epoch": 4.272058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016221055760979652, "kl": 0.006508924067020416, "learning_rate": 9.611996806599855e-07, "loss": 6.537092849612236e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 489.75, "completions/min_length": 399.0, "epoch": 4.273529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1521406173706055, "kl": 0.006577892345376313, "learning_rate": 9.611500984373064e-07, "loss": 6.50439687888138e-05, "reward": 0.5427083373069763, "reward_std": 0.03630708530545235, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4270833432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7721272706985474, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 465.125, "completions/min_length": 366.0, "epoch": 4.275, "frac_reward_zero_std": 0.5, "grad_norm": 1.2375251054763794, "kl": 0.004910156596451998, "learning_rate": 9.61100485835226e-07, "loss": 4.900991916656494e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 462.25, "completions/min_length": 372.0, "epoch": 4.276470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1636059284210205, "kl": 0.007778747589327395, "learning_rate": 9.61050842857012e-07, "loss": 7.759034633636475e-05, "reward": 0.7093750238418579, "reward_std": 0.19346658885478973, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.42979326844215393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.6884463429450989, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 423.1875, "completions/min_length": 369.0, "epoch": 4.277941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.008742384612560272, "kl": 0.005924961413256824, "learning_rate": 9.610011695059356e-07, "loss": 5.886472717975266e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 425.5, "completions/min_length": 402.0, "epoch": 4.279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0264352560043335, "kl": 0.00746500218519941, "learning_rate": 9.609514657852688e-07, "loss": 7.517635822296143e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 441.0, "completions/min_length": 373.0, "epoch": 4.280882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.5476336479187012, "kl": 0.006339463172480464, "learning_rate": 9.60901731698286e-07, "loss": 6.349384784698486e-05, "reward": 0.637499988079071, "reward_std": 0.4001959264278412, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 444.1875, "completions/min_length": 386.0, "epoch": 4.2823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.798486590385437, "kl": 0.005247456021606922, "learning_rate": 9.608519672482634e-07, "loss": 5.206465721130371e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.25, "completions/min_length": 382.0, "epoch": 4.283823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.016162084415555, "kl": 0.005445844319183379, "learning_rate": 9.608021724384795e-07, "loss": 5.50853437744081e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 485.6875, "completions/min_length": 402.0, "epoch": 4.285294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4516483545303345, "kl": 0.0050802144687622786, "learning_rate": 9.607523472722146e-07, "loss": 5.0574541091918945e-05, "reward": 0.6406667232513428, "reward_std": 0.18711234629154205, "rewards/DrugCombAccuracyCOTORM/mean": 0.5508333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.4906647205352783, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 429.5, "completions/min_length": 372.0, "epoch": 4.286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.023939160630106926, "kl": 0.007821742095984519, "learning_rate": 9.60702491752751e-07, "loss": 7.792632095515728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 480.25, "completions/min_length": 417.0, "epoch": 4.288235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0237367153167725, "kl": 0.0070116708520799875, "learning_rate": 9.606526058833733e-07, "loss": 7.005035877227783e-05, "reward": 0.8725833296775818, "reward_std": 0.17491772770881653, "rewards/DrugCombAccuracyCOTORM/mean": 0.84333336353302, "rewards/DrugCombAccuracyCOTORM/std": 0.34016117453575134, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 444.125, "completions/min_length": 385.0, "epoch": 4.2897058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.8110727667808533, "kl": 0.004201505973469466, "learning_rate": 9.606026896673678e-07, "loss": 4.1911891457857564e-05, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 468.625, "completions/min_length": 377.0, "epoch": 4.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7885528802871704, "kl": 0.005557048367336392, "learning_rate": 9.605527431080225e-07, "loss": 5.535408854484558e-05, "reward": 0.925000011920929, "reward_std": 0.14880476891994476, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 448.9375, "completions/min_length": 358.0, "epoch": 4.29264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9341679215431213, "kl": 0.006516898749396205, "learning_rate": 9.60502766208628e-07, "loss": 6.500409654108807e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 470.4375, "completions/min_length": 395.0, "epoch": 4.294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01093920599669218, "kl": 0.00696201715618372, "learning_rate": 9.604527589724767e-07, "loss": 6.987643428146839e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 472.5625, "completions/min_length": 424.0, "epoch": 4.295588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012867376208305359, "kl": 0.004969123401679099, "learning_rate": 9.60402721402863e-07, "loss": 4.95935273647774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 485.625, "completions/min_length": 357.0, "epoch": 4.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8555812835693359, "kl": 0.006985996733419597, "learning_rate": 9.60352653503083e-07, "loss": 6.895512342453003e-05, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 462.125, "completions/min_length": 387.0, "epoch": 4.298529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.022833220660686493, "kl": 0.00697714532725513, "learning_rate": 9.603025552764353e-07, "loss": 7.013871800154448e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 436.625, "completions/min_length": 386.0, "epoch": 4.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071086878888309, "kl": 0.005250563146546483, "learning_rate": 9.602524267262202e-07, "loss": 5.290783155942336e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 426.25, "completions/min_length": 376.0, "epoch": 4.301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.04494050145149231, "kl": 0.007901010336354375, "learning_rate": 9.602022678557398e-07, "loss": 7.90459816926159e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 392.6875, "completions/min_length": 350.0, "epoch": 4.302941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010862993076443672, "kl": 0.006004055263474584, "learning_rate": 9.601520786682988e-07, "loss": 5.9967343986500055e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 463.8125, "completions/min_length": 392.0, "epoch": 4.304411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.403833270072937, "kl": 0.005884304759092629, "learning_rate": 9.601018591672032e-07, "loss": 5.933642387390137e-05, "reward": 0.6333333253860474, "reward_std": 0.18856181204319, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 471.125, "completions/min_length": 433.0, "epoch": 4.305882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1975499391555786, "kl": 0.007506326423026621, "learning_rate": 9.600516093557615e-07, "loss": 7.620398537255824e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 444.375, "completions/min_length": 355.0, "epoch": 4.307352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01091875322163105, "kl": 0.005671904655173421, "learning_rate": 9.600013292372842e-07, "loss": 5.699564644601196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 428.6875, "completions/min_length": 318.0, "epoch": 4.3088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.1335432529449463, "kl": 0.006515114102512598, "learning_rate": 9.599510188150832e-07, "loss": 6.547994416905567e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 447.6875, "completions/min_length": 352.0, "epoch": 4.310294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9777655601501465, "kl": 0.0065689809853211045, "learning_rate": 9.599006780924732e-07, "loss": 6.583333015441895e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 517.875, "completions/min_length": 481.0, "epoch": 4.311764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8750053644180298, "kl": 0.003943738352973014, "learning_rate": 9.598503070727705e-07, "loss": 3.9526574255432934e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 488.4375, "completions/min_length": 440.0, "epoch": 4.313235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0210411474108696, "kl": 0.006063557346351445, "learning_rate": 9.597999057592931e-07, "loss": 6.0263570048846304e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 416.6875, "completions/min_length": 345.0, "epoch": 4.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01820860430598259, "kl": 0.006334249512292445, "learning_rate": 9.597494741553615e-07, "loss": 6.239194772206247e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 504.375, "completions/min_length": 369.0, "epoch": 4.3161764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.0001271963119507, "kl": 0.005310707143507898, "learning_rate": 9.596990122642983e-07, "loss": 5.301274359226227e-05, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 463.4375, "completions/min_length": 412.0, "epoch": 4.317647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0604209899902344, "kl": 0.005384786694776267, "learning_rate": 9.596485200894273e-07, "loss": 5.3476345783565193e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 496.5625, "completions/min_length": 395.0, "epoch": 4.319117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2078542709350586, "kl": 0.00780603161547333, "learning_rate": 9.59597997634075e-07, "loss": 7.78393296059221e-05, "reward": 0.4113999903202057, "reward_std": 0.2016218900680542, "rewards/DrugCombAccuracyCOTORM/mean": 0.28299999237060547, "rewards/DrugCombAccuracyCOTORM/std": 0.4311436116695404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.26832816004753113, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 534.625, "completions/min_length": 440.0, "epoch": 4.320588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.5194826126098633, "kl": 0.006799026974476874, "learning_rate": 9.595474449015698e-07, "loss": 6.809085607528687e-05, "reward": 0.5583124756813049, "reward_std": 0.24788105487823486, "rewards/DrugCombAccuracyCOTORM/mean": 0.47132813930511475, "rewards/DrugCombAccuracyCOTORM/std": 0.49923598766326904, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.75, "completions/min_length": 404.0, "epoch": 4.322058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9751812815666199, "kl": 0.0050084059475921094, "learning_rate": 9.594968618952419e-07, "loss": 5.0080096116289496e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 437.75, "completions/min_length": 388.0, "epoch": 4.323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0451582670211792, "kl": 0.005904391291551292, "learning_rate": 9.594462486184235e-07, "loss": 5.898850940866396e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.875, "completions/min_length": 394.0, "epoch": 4.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.01836668699979782, "kl": 0.006622745539061725, "learning_rate": 9.593956050744492e-07, "loss": 6.597862375201657e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 436.1875, "completions/min_length": 373.0, "epoch": 4.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010575809516012669, "kl": 0.005479701445437968, "learning_rate": 9.593449312666548e-07, "loss": 5.54338694200851e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 467.375, "completions/min_length": 360.0, "epoch": 4.327941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013372907415032387, "kl": 0.007302732206881046, "learning_rate": 9.59294227198379e-07, "loss": 7.226431625895202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 421.3125, "completions/min_length": 325.0, "epoch": 4.329411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1648366451263428, "kl": 0.005344663979485631, "learning_rate": 9.592434928729615e-07, "loss": 5.36926272616256e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 497.4375, "completions/min_length": 430.0, "epoch": 4.330882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.445523738861084, "kl": 0.007336774608120322, "learning_rate": 9.59192728293745e-07, "loss": 7.322450255742297e-05, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 504.375, "completions/min_length": 392.0, "epoch": 4.33235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7262859344482422, "kl": 0.0056309179635718465, "learning_rate": 9.591419334640737e-07, "loss": 5.613267421722412e-05, "reward": 0.5484374761581421, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 422.5625, "completions/min_length": 360.0, "epoch": 4.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0006020069122314, "kl": 0.006227563368156552, "learning_rate": 9.590911083872937e-07, "loss": 6.243019743124023e-05, "reward": 0.25, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 461.0625, "completions/min_length": 429.0, "epoch": 4.3352941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.015632398426532745, "kl": 0.0061292542377486825, "learning_rate": 9.590402530667535e-07, "loss": 6.150405533844605e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 481.9375, "completions/min_length": 385.0, "epoch": 4.336764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5832486152648926, "kl": 0.00833807303570211, "learning_rate": 9.589893675058029e-07, "loss": 8.295103907585144e-05, "reward": 0.550000011920929, "reward_std": 0.46579423546791077, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 389.25, "completions/min_length": 333.0, "epoch": 4.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00849606841802597, "kl": 0.004752568667754531, "learning_rate": 9.589384517077944e-07, "loss": 4.796481516677886e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 435.625, "completions/min_length": 371.0, "epoch": 4.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.2113499641418457, "kl": 0.008598415763117373, "learning_rate": 9.588875056760821e-07, "loss": 8.52346420288086e-05, "reward": 0.7875000238418579, "reward_std": 0.2295181155204773, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 468.0625, "completions/min_length": 401.0, "epoch": 4.341176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7906710505485535, "kl": 0.005858374992385507, "learning_rate": 9.588365294140222e-07, "loss": 5.821883678436279e-05, "reward": 0.8833333253860474, "reward_std": 0.14746536314487457, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2770128548145294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 406.5625, "completions/min_length": 373.0, "epoch": 4.3426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.008644964545965195, "kl": 0.004945636144839227, "learning_rate": 9.587855229249731e-07, "loss": 4.915382305625826e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 480.6875, "completions/min_length": 401.0, "epoch": 4.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1524752378463745, "kl": 0.012613029452040792, "learning_rate": 9.587344862122946e-07, "loss": 0.00012595951557159424, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 440.25, "completions/min_length": 379.0, "epoch": 4.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.018707290291786194, "kl": 0.007048980914987624, "learning_rate": 9.58683419279349e-07, "loss": 7.087378617143258e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 480.0625, "completions/min_length": 423.0, "epoch": 4.347058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.456084966659546, "kl": 0.0064386839512735605, "learning_rate": 9.586323221295007e-07, "loss": 6.335973739624023e-05, "reward": 0.6312500238418579, "reward_std": 0.42117685079574585, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 476.6875, "completions/min_length": 408.0, "epoch": 4.348529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8525313138961792, "kl": 0.0051061484846286476, "learning_rate": 9.585811947661157e-07, "loss": 5.075335502624512e-05, "reward": 0.948437511920929, "reward_std": 0.14584076404571533, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 447.9375, "completions/min_length": 408.0, "epoch": 4.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.7047373056411743, "kl": 0.006966027547605336, "learning_rate": 9.58530037192562e-07, "loss": 6.99758529663086e-05, "reward": 0.5750000476837158, "reward_std": 0.2449311912059784, "rewards/DrugCombAccuracyCOTORM/mean": 0.46875, "rewards/DrugCombAccuracyCOTORM/std": 0.46435439586639404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 449.25, "completions/min_length": 395.0, "epoch": 4.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01407236885279417, "kl": 0.0053553509642370045, "learning_rate": 9.5847884941221e-07, "loss": 5.3535797633230686e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 453.0, "completions/min_length": 392.0, "epoch": 4.352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8272129893302917, "kl": 0.004703365848399699, "learning_rate": 9.584276314284314e-07, "loss": 4.688589979195967e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 444.3125, "completions/min_length": 411.0, "epoch": 4.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01931794360280037, "kl": 0.008081911131739616, "learning_rate": 9.583763832446007e-07, "loss": 8.03883740445599e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 421.0625, "completions/min_length": 396.0, "epoch": 4.355882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012089371681213379, "kl": 0.006960134371183813, "learning_rate": 9.583251048640939e-07, "loss": 6.9788409746252e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 463.5625, "completions/min_length": 387.0, "epoch": 4.357352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012191091664135456, "kl": 0.0056896747555583715, "learning_rate": 9.58273796290289e-07, "loss": 5.7060864492086694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 496.75, "completions/min_length": 438.0, "epoch": 4.358823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.000975489616394, "kl": 0.006559668108820915, "learning_rate": 9.582224575265663e-07, "loss": 6.44288957118988e-05, "reward": 0.8306608200073242, "reward_std": 0.04256724193692207, "rewards/DrugCombAccuracyCOTORM/mean": 0.8095499873161316, "rewards/DrugCombAccuracyCOTORM/std": 0.20587298274040222, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8302083015441895, "rewards/DrugCombCoverageCOTORM/std": 0.24752245843410492, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 437.9375, "completions/min_length": 380.0, "epoch": 4.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.885198712348938, "kl": 0.004862273810431361, "learning_rate": 9.581710885763076e-07, "loss": 4.860013723373413e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 445.0, "completions/min_length": 406.0, "epoch": 4.3617647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.01401072833687067, "kl": 0.005420624162070453, "learning_rate": 9.581196894428972e-07, "loss": 5.4302377975545824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 443.125, "completions/min_length": 402.0, "epoch": 4.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7732528448104858, "kl": 0.0059554571053013206, "learning_rate": 9.580682601297208e-07, "loss": 5.902483098907396e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 449.6875, "completions/min_length": 360.0, "epoch": 4.364705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0066661834716797, "kl": 0.005211561685428023, "learning_rate": 9.580168006401667e-07, "loss": 5.22322952747345e-05, "reward": 0.8691666722297668, "reward_std": 0.1808096468448639, "rewards/DrugCombAccuracyCOTORM/mean": 0.846875011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3294129967689514, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.18257419764995575, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 463.3125, "completions/min_length": 346.0, "epoch": 4.366176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.184005618095398, "kl": 0.005509197246283293, "learning_rate": 9.579653109776248e-07, "loss": 5.459785461425781e-05, "reward": 0.4714166522026062, "reward_std": 0.2019968330860138, "rewards/DrugCombAccuracyCOTORM/mean": 0.45124998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.10416668653488159, "rewards/DrugCombCoverageCOTORM/std": 1.0089874267578125, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 400.6875, "completions/min_length": 357.0, "epoch": 4.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01135361846536398, "kl": 0.004982942948117852, "learning_rate": 9.579137911454874e-07, "loss": 4.9634789320407435e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 457.0625, "completions/min_length": 412.0, "epoch": 4.3691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9422470331192017, "kl": 0.005131920916028321, "learning_rate": 9.578622411471482e-07, "loss": 5.134780803928152e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 417.6875, "completions/min_length": 361.0, "epoch": 4.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639506570994854, "kl": 0.008728948538191617, "learning_rate": 9.578106609860032e-07, "loss": 8.719096513232216e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 454.6875, "completions/min_length": 406.0, "epoch": 4.372058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1578344106674194, "kl": 0.006454066780861467, "learning_rate": 9.577590506654505e-07, "loss": 6.471201777458191e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 421.4375, "completions/min_length": 376.0, "epoch": 4.373529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008674579672515392, "kl": 0.005023913225159049, "learning_rate": 9.5770741018889e-07, "loss": 4.998356234864332e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 454.5625, "completions/min_length": 403.0, "epoch": 4.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.005621600896120071, "kl": 0.0036284461966715753, "learning_rate": 9.576557395597236e-07, "loss": 3.611218198784627e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 483.3125, "completions/min_length": 434.0, "epoch": 4.376470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5418368577957153, "kl": 0.008067638496868312, "learning_rate": 9.57604038781355e-07, "loss": 8.06078314781189e-05, "reward": 0.59375, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 448.4375, "completions/min_length": 395.0, "epoch": 4.377941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9093868732452393, "kl": 0.006139751640148461, "learning_rate": 9.575523078571908e-07, "loss": 6.16402248851955e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 464.6875, "completions/min_length": 408.0, "epoch": 4.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01290111243724823, "kl": 0.006367875263094902, "learning_rate": 9.575005467906384e-07, "loss": 6.403525912901387e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 452.375, "completions/min_length": 364.0, "epoch": 4.3808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.013151987455785275, "kl": 0.00625241338275373, "learning_rate": 9.574487555851076e-07, "loss": 6.32867740932852e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 447.1875, "completions/min_length": 374.0, "epoch": 4.382352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.7107748985290527, "kl": 0.0076572534162551165, "learning_rate": 9.573969342440105e-07, "loss": 7.668137550354004e-05, "reward": 0.5375000238418579, "reward_std": 0.40996551513671875, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 413.25, "completions/min_length": 358.0, "epoch": 4.383823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.5196490287780762, "kl": 0.008074584067799151, "learning_rate": 9.573450827707608e-07, "loss": 7.980410009622574e-05, "reward": 0.7875000238418579, "reward_std": 0.22795677185058594, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 421.1875, "completions/min_length": 379.0, "epoch": 4.385294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9836259484291077, "kl": 0.005201616033446044, "learning_rate": 9.572932011687746e-07, "loss": 5.202126703807153e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 531.1875, "completions/min_length": 430.0, "epoch": 4.386764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3492475748062134, "kl": 0.005148867785464972, "learning_rate": 9.572412894414696e-07, "loss": 5.1312148571014404e-05, "reward": 0.8926249742507935, "reward_std": 0.30370235443115234, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 470.625, "completions/min_length": 391.0, "epoch": 4.3882352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.010439595207571983, "kl": 0.004914382938295603, "learning_rate": 9.571893475922655e-07, "loss": 4.935319520882331e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 396.625, "completions/min_length": 344.0, "epoch": 4.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.039366286247968674, "kl": 0.008482449571602046, "learning_rate": 9.571373756245842e-07, "loss": 8.615148544777185e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 438.6875, "completions/min_length": 382.0, "epoch": 4.391176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014329158701002598, "kl": 0.0053038444602862, "learning_rate": 9.570853735418494e-07, "loss": 5.3135609050514176e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 436.4375, "completions/min_length": 389.0, "epoch": 4.392647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02349947579205036, "kl": 0.007570692338049412, "learning_rate": 9.570333413474868e-07, "loss": 7.557334174634889e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 478.5625, "completions/min_length": 437.0, "epoch": 4.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.00784770492464304, "kl": 0.005049657833296806, "learning_rate": 9.569812790449246e-07, "loss": 5.064411379862577e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 473.8125, "completions/min_length": 411.0, "epoch": 4.395588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6080626249313354, "kl": 0.0068968418054282665, "learning_rate": 9.56929186637592e-07, "loss": 6.91637396812439e-05, "reward": 0.21774999797344208, "reward_std": 0.40113210678100586, "rewards/DrugCombAccuracyCOTORM/mean": 0.1966666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.40022215247154236, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.3958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.9287087917327881, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 443.3125, "completions/min_length": 398.0, "epoch": 4.397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011643907986581326, "kl": 0.006747392239049077, "learning_rate": 9.56877064128921e-07, "loss": 6.687624409096316e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 437.875, "completions/min_length": 382.0, "epoch": 4.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8760647177696228, "kl": 0.005748632596805692, "learning_rate": 9.568249115223452e-07, "loss": 5.777409751317464e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 479.8125, "completions/min_length": 435.0, "epoch": 4.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.890601396560669, "kl": 0.006482783704996109, "learning_rate": 9.567727288213004e-07, "loss": 6.494112312793732e-05, "reward": 0.22500000894069672, "reward_std": 0.20528726279735565, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 477.6875, "completions/min_length": 415.0, "epoch": 4.401470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6832317113876343, "kl": 0.0069989770418033, "learning_rate": 9.56720516029224e-07, "loss": 7.005651423241943e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 461.1875, "completions/min_length": 417.0, "epoch": 4.402941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.010066204704344273, "kl": 0.005405166302807629, "learning_rate": 9.56668273149556e-07, "loss": 5.4318574257194996e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 491.25, "completions/min_length": 402.0, "epoch": 4.404411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9068887829780579, "kl": 0.007027282728813589, "learning_rate": 9.566160001857377e-07, "loss": 6.987154483795166e-05, "reward": 0.621791660785675, "reward_std": 0.15614084899425507, "rewards/DrugCombAccuracyCOTORM/mean": 0.5728124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5018232464790344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6354166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.46435439586639404, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 483.875, "completions/min_length": 423.0, "epoch": 4.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1129624843597412, "kl": 0.007226949906907976, "learning_rate": 9.565636971412129e-07, "loss": 7.309764623641968e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 437.5, "completions/min_length": 392.0, "epoch": 4.4073529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.30789053440094, "kl": 0.007982417359016836, "learning_rate": 9.565113640194273e-07, "loss": 8.136779069900513e-05, "reward": 0.4761999845504761, "reward_std": 0.38764292001724243, "rewards/DrugCombAccuracyCOTORM/mean": 0.37024998664855957, "rewards/DrugCombAccuracyCOTORM/std": 0.44915273785591125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.800000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.5016639232635498, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 522.25, "completions/min_length": 451.0, "epoch": 4.408823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2827873229980469, "kl": 0.008006305084563792, "learning_rate": 9.564590008238283e-07, "loss": 8.097290992736816e-05, "reward": 0.2227500081062317, "reward_std": 0.18354618549346924, "rewards/DrugCombAccuracyCOTORM/mean": 0.08312500268220901, "rewards/DrugCombAccuracyCOTORM/std": 0.2508510649204254, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 413.5, "completions/min_length": 375.0, "epoch": 4.410294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009859916754066944, "kl": 0.004952622693963349, "learning_rate": 9.564066075578654e-07, "loss": 4.9528651288710535e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 475.5, "completions/min_length": 422.0, "epoch": 4.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9080517888069153, "kl": 0.006915897480212152, "learning_rate": 9.5635418422499e-07, "loss": 6.927181675564498e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 440.375, "completions/min_length": 388.0, "epoch": 4.413235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1522290706634521, "kl": 0.007132192840799689, "learning_rate": 9.563017308286561e-07, "loss": 7.073953747749329e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 451.875, "completions/min_length": 408.0, "epoch": 4.4147058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.8153344988822937, "kl": 0.008433364680968225, "learning_rate": 9.56249247372319e-07, "loss": 8.440401870757341e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 439.5625, "completions/min_length": 382.0, "epoch": 4.416176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.015186503529548645, "kl": 0.007151346188038588, "learning_rate": 9.561967338594358e-07, "loss": 7.213393837446347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 472.5, "completions/min_length": 376.0, "epoch": 4.41764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.199724793434143, "kl": 0.00808113208040595, "learning_rate": 9.561441902934667e-07, "loss": 8.047735173022375e-05, "reward": 0.5839166641235352, "reward_std": 0.17849332094192505, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666567325592, "rewards/DrugCombCoverageCOTORM/std": 0.9867173433303833, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 469.4375, "completions/min_length": 419.0, "epoch": 4.419117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8871225714683533, "kl": 0.006311569828540087, "learning_rate": 9.560916166778723e-07, "loss": 6.303206464508548e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 451.6875, "completions/min_length": 372.0, "epoch": 4.420588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008716301061213017, "kl": 0.004597840714268386, "learning_rate": 9.560390130161165e-07, "loss": 4.632554919226095e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 443.8125, "completions/min_length": 395.0, "epoch": 4.422058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010756406001746655, "kl": 0.0057849648874253035, "learning_rate": 9.559863793116648e-07, "loss": 5.768111441284418e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 465.75, "completions/min_length": 354.0, "epoch": 4.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9247351884841919, "kl": 0.005368929821997881, "learning_rate": 9.559337155679841e-07, "loss": 5.404651165008545e-05, "reward": 0.8166666626930237, "reward_std": 0.18771812319755554, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39849257469177246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 455.75, "completions/min_length": 381.0, "epoch": 4.425, "frac_reward_zero_std": 1.0, "grad_norm": 0.01851453073322773, "kl": 0.005047462065704167, "learning_rate": 9.558810217885442e-07, "loss": 5.0494745664764196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 449.0625, "completions/min_length": 410.0, "epoch": 4.426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8787833452224731, "kl": 0.004281770088709891, "learning_rate": 9.558282979768163e-07, "loss": 4.281848669052124e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 504.625, "completions/min_length": 385.0, "epoch": 4.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8877255320549011, "kl": 0.0064875956159085035, "learning_rate": 9.557755441362737e-07, "loss": 6.455183029174805e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 494.625, "completions/min_length": 380.0, "epoch": 4.429411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8553895950317383, "kl": 0.007074872381053865, "learning_rate": 9.557227602703918e-07, "loss": 7.087853737175465e-05, "reward": 0.8205000162124634, "reward_std": 0.1959603875875473, "rewards/DrugCombAccuracyCOTORM/mean": 0.7912499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.3766496777534485, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 497.0625, "completions/min_length": 459.0, "epoch": 4.430882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2641291618347168, "kl": 0.005911407410167158, "learning_rate": 9.556699463826474e-07, "loss": 5.911290645599365e-05, "reward": 0.47224998474121094, "reward_std": 0.2417517602443695, "rewards/DrugCombAccuracyCOTORM/mean": 0.39499998092651367, "rewards/DrugCombAccuracyCOTORM/std": 0.43158623576164246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 450.6875, "completions/min_length": 365.0, "epoch": 4.432352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3853669166564941, "kl": 0.0071920526679605246, "learning_rate": 9.556171024765202e-07, "loss": 7.125735282897949e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 520.5625, "completions/min_length": 450.0, "epoch": 4.4338235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 1.6899853944778442, "kl": 0.013773293816484511, "learning_rate": 9.555642285554915e-07, "loss": 0.00013677775859832764, "reward": 0.6974745988845825, "reward_std": 0.30388936400413513, "rewards/DrugCombAccuracyCOTORM/mean": 0.6556974649429321, "rewards/DrugCombAccuracyCOTORM/std": 0.39733317494392395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.38429832458496094, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 481.4375, "completions/min_length": 372.0, "epoch": 4.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.005039581097662449, "kl": 0.0035702286404557526, "learning_rate": 9.555113246230442e-07, "loss": 3.555600414983928e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 406.8125, "completions/min_length": 347.0, "epoch": 4.436764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7455194592475891, "kl": 0.005651299026794732, "learning_rate": 9.554583906826635e-07, "loss": 5.6974589824676514e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/mean_length": 506.8125, "completions/min_length": 369.0, "epoch": 4.438235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9960042834281921, "kl": 0.006433437927626073, "learning_rate": 9.554054267378368e-07, "loss": 6.496906280517578e-05, "reward": 0.6787708401679993, "reward_std": 0.14652597904205322, "rewards/DrugCombAccuracyCOTORM/mean": 0.6284114718437195, "rewards/DrugCombAccuracyCOTORM/std": 0.45031535625457764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2852468192577362, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 407.625, "completions/min_length": 354.0, "epoch": 4.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007976341992616653, "kl": 0.004992050118744373, "learning_rate": 9.553524327920531e-07, "loss": 4.949884896632284e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 436.1875, "completions/min_length": 351.0, "epoch": 4.4411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.1445550918579102, "kl": 0.008126913919113576, "learning_rate": 9.552994088488032e-07, "loss": 8.166581392288208e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 465.3125, "completions/min_length": 407.0, "epoch": 4.442647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013023355044424534, "kl": 0.006126814754679799, "learning_rate": 9.552463549115807e-07, "loss": 6.115966971265152e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 465.125, "completions/min_length": 429.0, "epoch": 4.444117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.526700735092163, "kl": 0.007630268461070955, "learning_rate": 9.551932709838804e-07, "loss": 7.645785808563232e-05, "reward": 0.512499988079071, "reward_std": 0.4153291583061218, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 479.4375, "completions/min_length": 419.0, "epoch": 4.445588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.06454336643219, "kl": 0.007505378685891628, "learning_rate": 9.551401570691993e-07, "loss": 7.66286175348796e-05, "reward": 0.74609375, "reward_std": 0.13358624279499054, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3859512209892273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8776041269302368, "rewards/DrugCombCoverageCOTORM/std": 0.12862229347229004, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 509.75, "completions/min_length": 455.0, "epoch": 4.447058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3013721704483032, "kl": 0.008698859717696905, "learning_rate": 9.550870131710365e-07, "loss": 8.899718523025513e-05, "reward": 0.9115833044052124, "reward_std": 0.16982077062129974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.26440009474754333, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 458.125, "completions/min_length": 405.0, "epoch": 4.448529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4635014533996582, "kl": 0.007553552626632154, "learning_rate": 9.55033839292893e-07, "loss": 7.569044828414917e-05, "reward": 0.6625000238418579, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 408.25, "completions/min_length": 322.0, "epoch": 4.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.0171331986784935, "kl": 0.007686135242693126, "learning_rate": 9.549806354382715e-07, "loss": 7.643770368304104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 476.8125, "completions/min_length": 323.0, "epoch": 4.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9701204895973206, "kl": 0.00530922063626349, "learning_rate": 9.549274016106774e-07, "loss": 5.325134043232538e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 480.5625, "completions/min_length": 431.0, "epoch": 4.452941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3317315578460693, "kl": 0.007201476022601128, "learning_rate": 9.548741378136173e-07, "loss": 7.238611578941345e-05, "reward": 0.6770208477973938, "reward_std": 0.33235281705856323, "rewards/DrugCombAccuracyCOTORM/mean": 0.6151562929153442, "rewards/DrugCombAccuracyCOTORM/std": 0.39758381247520447, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8489583730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15875309705734253, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 477.75, "completions/min_length": 419.0, "epoch": 4.454411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008458071388304234, "kl": 0.004549748846329749, "learning_rate": 9.548208440506e-07, "loss": 4.5488566684070975e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 450.8125, "completions/min_length": 358.0, "epoch": 4.455882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.03947088494896889, "kl": 0.007450665696524084, "learning_rate": 9.547675203251366e-07, "loss": 7.376129360636696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 463.625, "completions/min_length": 393.0, "epoch": 4.45735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013638283126056194, "kl": 0.005843218765221536, "learning_rate": 9.547141666407398e-07, "loss": 5.878767842659727e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 415.9375, "completions/min_length": 329.0, "epoch": 4.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.008887390606105328, "kl": 0.004795192915480584, "learning_rate": 9.546607830009246e-07, "loss": 4.769949373439886e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 491.25, "completions/min_length": 400.0, "epoch": 4.4602941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.009009946137666702, "kl": 0.0045446823933161795, "learning_rate": 9.546073694092076e-07, "loss": 4.557706415653229e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 463.0, "completions/min_length": 381.0, "epoch": 4.461764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8264477849006653, "kl": 0.005834856943693012, "learning_rate": 9.545539258691075e-07, "loss": 5.801791849080473e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 458.125, "completions/min_length": 373.0, "epoch": 4.463235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.5247114896774292, "kl": 0.0057342154905200005, "learning_rate": 9.54500452384145e-07, "loss": 5.768239498138428e-05, "reward": 0.84375, "reward_std": 0.3386114239692688, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 423.6875, "completions/min_length": 380.0, "epoch": 4.464705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02311627008020878, "kl": 0.005469167372211814, "learning_rate": 9.54446948957843e-07, "loss": 5.396276173996739e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 417.125, "completions/min_length": 365.0, "epoch": 4.466176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0528216361999512, "kl": 0.00654745742212981, "learning_rate": 9.54393415593726e-07, "loss": 6.492766988230869e-05, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 495.75, "completions/min_length": 438.0, "epoch": 4.4676470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4033734798431396, "kl": 0.0061051949160173535, "learning_rate": 9.543398522953208e-07, "loss": 6.120651960372925e-05, "reward": 0.5307333469390869, "reward_std": 0.30428430438041687, "rewards/DrugCombAccuracyCOTORM/mean": 0.421750009059906, "rewards/DrugCombAccuracyCOTORM/std": 0.4720044732093811, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9333333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.12412656843662262, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 473.75, "completions/min_length": 424.0, "epoch": 4.469117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.764677107334137, "kl": 0.005425487644970417, "learning_rate": 9.542862590661557e-07, "loss": 5.443141708383337e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 439.0, "completions/min_length": 381.0, "epoch": 4.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.943456768989563, "kl": 0.007127232151106, "learning_rate": 9.542326359097617e-07, "loss": 7.073728193063289e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 444.3125, "completions/min_length": 388.0, "epoch": 4.472058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8251609206199646, "kl": 0.005849164794199169, "learning_rate": 9.541789828296712e-07, "loss": 5.832858369103633e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 450.625, "completions/min_length": 400.0, "epoch": 4.473529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8866195678710938, "kl": 0.006619690451771021, "learning_rate": 9.541252998294188e-07, "loss": 6.593763828277588e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 436.125, "completions/min_length": 390.0, "epoch": 4.475, "frac_reward_zero_std": 0.5, "grad_norm": 1.1485871076583862, "kl": 0.005664286902174354, "learning_rate": 9.540715869125407e-07, "loss": 5.684000643668696e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 430.1875, "completions/min_length": 357.0, "epoch": 4.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012391136027872562, "kl": 0.0050132510950788856, "learning_rate": 9.540178440825754e-07, "loss": 5.048608727520332e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 452.625, "completions/min_length": 390.0, "epoch": 4.477941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5282667875289917, "kl": 0.007597762392833829, "learning_rate": 9.539640713430637e-07, "loss": 7.646530866622925e-05, "reward": 0.7000000476837158, "reward_std": 0.3989730179309845, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 446.4375, "completions/min_length": 382.0, "epoch": 4.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011592132970690727, "kl": 0.005743961548432708, "learning_rate": 9.53910268697548e-07, "loss": 5.743158544646576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 475.625, "completions/min_length": 390.0, "epoch": 4.480882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1706385612487793, "kl": 0.006827235338278115, "learning_rate": 9.538564361495723e-07, "loss": 6.808603211538866e-05, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 468.125, "completions/min_length": 445.0, "epoch": 4.482352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 14.197779655456543, "kl": 0.10240466229151934, "learning_rate": 9.53802573702683e-07, "loss": 0.0009726067655719817, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 460.0625, "completions/min_length": 410.0, "epoch": 4.483823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8803820610046387, "kl": 0.00627692264970392, "learning_rate": 9.53748681360429e-07, "loss": 6.321072578430176e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 503.5625, "completions/min_length": 393.0, "epoch": 4.485294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1341018676757812, "kl": 0.006340811029076576, "learning_rate": 9.536947591263598e-07, "loss": 6.327778100967407e-05, "reward": 0.6778673529624939, "reward_std": 0.02565295249223709, "rewards/DrugCombAccuracyCOTORM/mean": 0.6156718730926514, "rewards/DrugCombAccuracyCOTORM/std": 0.3982185125350952, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8532986044883728, "rewards/DrugCombCoverageCOTORM/std": 0.17887169122695923, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 438.0, "completions/min_length": 362.0, "epoch": 4.4867647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.0040804147720337, "kl": 0.005316208116710186, "learning_rate": 9.536408070040281e-07, "loss": 5.275291914585978e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 429.875, "completions/min_length": 369.0, "epoch": 4.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01247339602559805, "kl": 0.0062938976334407926, "learning_rate": 9.535868249969882e-07, "loss": 6.277368811424822e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 434.875, "completions/min_length": 370.0, "epoch": 4.489705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.019478432834148407, "kl": 0.007396831992082298, "learning_rate": 9.535328131087961e-07, "loss": 7.510090654250234e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 443.5, "completions/min_length": 389.0, "epoch": 4.491176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.5074201822280884, "kl": 0.00537711929064244, "learning_rate": 9.534787713430099e-07, "loss": 5.361183502827771e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 426.25, "completions/min_length": 359.0, "epoch": 4.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.00930122658610344, "kl": 0.00593754171859473, "learning_rate": 9.5342469970319e-07, "loss": 5.925237201154232e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 416.75, "completions/min_length": 362.0, "epoch": 4.4941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010478805750608444, "kl": 0.005047908693086356, "learning_rate": 9.533705981928982e-07, "loss": 5.110143683850765e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 471.1875, "completions/min_length": 398.0, "epoch": 4.495588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9654192924499512, "kl": 0.005774392338935286, "learning_rate": 9.533164668156989e-07, "loss": 5.750139825977385e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 431.5625, "completions/min_length": 342.0, "epoch": 4.497058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013317609205842018, "kl": 0.005912162596359849, "learning_rate": 9.532623055751578e-07, "loss": 5.8677938795881346e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 471.375, "completions/min_length": 405.0, "epoch": 4.498529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.982909619808197, "kl": 0.005744208465330303, "learning_rate": 9.532081144748432e-07, "loss": 5.7382207160117105e-05, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 519.6875, "completions/min_length": 479.0, "epoch": 4.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.012939457781612873, "kl": 0.00578085333108902, "learning_rate": 9.531538935183249e-07, "loss": 5.780319042969495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 463.1875, "completions/min_length": 416.0, "epoch": 4.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9818336963653564, "kl": 0.006323447218164802, "learning_rate": 9.530996427091748e-07, "loss": 6.304035196080804e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 464.1875, "completions/min_length": 374.0, "epoch": 4.502941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.021286973729729652, "kl": 0.005920878727920353, "learning_rate": 9.530453620509671e-07, "loss": 5.981499998597428e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 419.0625, "completions/min_length": 336.0, "epoch": 4.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010656235739588737, "kl": 0.005660819471813738, "learning_rate": 9.529910515472775e-07, "loss": 5.64684669370763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 497.0, "completions/min_length": 419.0, "epoch": 4.5058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9971792101860046, "kl": 0.0170155456289649, "learning_rate": 9.529367112016835e-07, "loss": 0.0001642853021621704, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 442.0625, "completions/min_length": 392.0, "epoch": 4.507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.007624940015375614, "kl": 0.005622714990749955, "learning_rate": 9.528823410177654e-07, "loss": 5.612754830508493e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 480.4375, "completions/min_length": 402.0, "epoch": 4.508823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0286940336227417, "kl": 0.006301730289123952, "learning_rate": 9.528279409991047e-07, "loss": 6.308406591415405e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 497.625, "completions/min_length": 426.0, "epoch": 4.510294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8735039234161377, "kl": 0.007297781063243747, "learning_rate": 9.527735111492852e-07, "loss": 7.371466199401766e-05, "reward": 0.6273333430290222, "reward_std": 0.04703797399997711, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 438.0625, "completions/min_length": 367.0, "epoch": 4.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00886167585849762, "kl": 0.004369884671177715, "learning_rate": 9.527190514718927e-07, "loss": 4.434536458575167e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 449.1875, "completions/min_length": 410.0, "epoch": 4.5132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.03325178846716881, "kl": 0.006852305261418223, "learning_rate": 9.526645619705147e-07, "loss": 6.785391451558098e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 476.8125, "completions/min_length": 416.0, "epoch": 4.514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008394218981266022, "kl": 0.004785052151419222, "learning_rate": 9.526100426487409e-07, "loss": 4.8109530325746164e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 480.0, "completions/min_length": 438.0, "epoch": 4.516176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.9212526082992554, "kl": 0.022647747537121177, "learning_rate": 9.52555493510163e-07, "loss": 0.0002271551638841629, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 418.9375, "completions/min_length": 371.0, "epoch": 4.517647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9140196442604065, "kl": 0.00718304724432528, "learning_rate": 9.525009145583744e-07, "loss": 7.187575101852417e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 432.8125, "completions/min_length": 352.0, "epoch": 4.519117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8198493123054504, "kl": 0.006187521968968213, "learning_rate": 9.524463057969707e-07, "loss": 6.202459189807996e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 459.125, "completions/min_length": 373.0, "epoch": 4.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9535241723060608, "kl": 0.005807889509014785, "learning_rate": 9.523916672295493e-07, "loss": 5.7913413911592215e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 418.75, "completions/min_length": 378.0, "epoch": 4.522058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02158273756504059, "kl": 0.0060256957076489925, "learning_rate": 9.523369988597099e-07, "loss": 6.02584368607495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 469.4375, "completions/min_length": 420.0, "epoch": 4.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9844986200332642, "kl": 0.006024971138685942, "learning_rate": 9.522823006910537e-07, "loss": 5.9457375755300745e-05, "reward": 0.8955755233764648, "reward_std": 0.032590001821517944, "rewards/DrugCombAccuracyCOTORM/mean": 0.877281904220581, "rewards/DrugCombAccuracyCOTORM/std": 0.13640707731246948, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 450.8125, "completions/min_length": 385.0, "epoch": 4.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.06946162134408951, "kl": 0.009988822508603334, "learning_rate": 9.522275727271841e-07, "loss": 0.00010030737757915631, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 445.0625, "completions/min_length": 382.0, "epoch": 4.526470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015894459560513496, "kl": 0.00542652141302824, "learning_rate": 9.521728149717063e-07, "loss": 5.440493987407535e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 428.1875, "completions/min_length": 381.0, "epoch": 4.527941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.010116546414792538, "kl": 0.006064945831894875, "learning_rate": 9.521180274282278e-07, "loss": 6.071694224374369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 450.9375, "completions/min_length": 429.0, "epoch": 4.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068876659497618675, "kl": 0.004357951984275132, "learning_rate": 9.520632101003579e-07, "loss": 4.357975922175683e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 460.8125, "completions/min_length": 418.0, "epoch": 4.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0853660106658936, "kl": 0.007246233057230711, "learning_rate": 9.520083629917078e-07, "loss": 7.227073365356773e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 407.875, "completions/min_length": 369.0, "epoch": 4.5323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.03295156732201576, "kl": 0.00636248046066612, "learning_rate": 9.519534861058904e-07, "loss": 6.451187073253095e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 519.6875, "completions/min_length": 450.0, "epoch": 4.533823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.3823094367980957, "kl": 0.005874216090887785, "learning_rate": 9.518985794465213e-07, "loss": 5.8554112911224365e-05, "reward": 0.6502500176429749, "reward_std": 0.2741650342941284, "rewards/DrugCombAccuracyCOTORM/mean": 0.6018750071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.45814070105552673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 480.8125, "completions/min_length": 362.0, "epoch": 4.535294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.00978450570255518, "kl": 0.004835040483158082, "learning_rate": 9.518436430172174e-07, "loss": 4.8130219511222094e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 423.875, "completions/min_length": 351.0, "epoch": 4.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9393509030342102, "kl": 0.006273979204706848, "learning_rate": 9.517886768215976e-07, "loss": 6.323337584035471e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 442.5625, "completions/min_length": 370.0, "epoch": 4.538235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1692041158676147, "kl": 0.007424959214404225, "learning_rate": 9.517336808632833e-07, "loss": 7.44350254535675e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 417.25, "completions/min_length": 354.0, "epoch": 4.5397058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.0881712436676025, "kl": 0.005173506564460695, "learning_rate": 9.516786551458973e-07, "loss": 5.1975250244140625e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 404.4375, "completions/min_length": 315.0, "epoch": 4.541176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010231459513306618, "kl": 0.006878909422084689, "learning_rate": 9.516235996730644e-07, "loss": 6.879161082906649e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 494.6875, "completions/min_length": 407.0, "epoch": 4.54264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018970878794789314, "kl": 0.007111985818482935, "learning_rate": 9.515685144484118e-07, "loss": 7.107958663254976e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 439.0, "completions/min_length": 395.0, "epoch": 4.544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.016602646559476852, "kl": 0.005772284290287644, "learning_rate": 9.515133994755682e-07, "loss": 5.763053923146799e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 481.75, "completions/min_length": 403.0, "epoch": 4.545588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9120445251464844, "kl": 0.0051209008088335395, "learning_rate": 9.514582547581645e-07, "loss": 5.100693306303583e-05, "reward": 0.9489166736602783, "reward_std": 0.11560136079788208, "rewards/DrugCombAccuracyCOTORM/mean": 0.9387500286102295, "rewards/DrugCombAccuracyCOTORM/std": 0.1980530321598053, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 456.4375, "completions/min_length": 402.0, "epoch": 4.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016546178609132767, "kl": 0.006397432065568864, "learning_rate": 9.514030802998334e-07, "loss": 6.391834176611155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 439.5, "completions/min_length": 379.0, "epoch": 4.548529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8720231652259827, "kl": 0.0078006163239479065, "learning_rate": 9.513478761042098e-07, "loss": 7.703644223511219e-05, "reward": 0.9019333124160767, "reward_std": 0.18418778479099274, "rewards/DrugCombAccuracyCOTORM/mean": 0.8878333568572998, "rewards/DrugCombAccuracyCOTORM/std": 0.30878183245658875, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 422.1875, "completions/min_length": 389.0, "epoch": 4.55, "frac_reward_zero_std": 0.0, "grad_norm": 1.3498696088790894, "kl": 0.006282178102992475, "learning_rate": 9.512926421749303e-07, "loss": 6.288290023803711e-05, "reward": 0.4749999940395355, "reward_std": 0.18771235644817352, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 419.1875, "completions/min_length": 375.0, "epoch": 4.551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.027559679001569748, "kl": 0.0071140696527436376, "learning_rate": 9.512373785156336e-07, "loss": 7.114920299500227e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 457.625, "completions/min_length": 368.0, "epoch": 4.552941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.723448395729065, "kl": 0.008560583693906665, "learning_rate": 9.511820851299604e-07, "loss": 8.548051118850708e-05, "reward": 0.3019374907016754, "reward_std": 0.2906763553619385, "rewards/DrugCombAccuracyCOTORM/mean": 0.20750001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.326751708984375, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.4013864994049072, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 419.1875, "completions/min_length": 368.0, "epoch": 4.554411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615951232612133, "kl": 0.010309401666745543, "learning_rate": 9.511267620215532e-07, "loss": 0.00010000656038755551, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 458.8125, "completions/min_length": 398.0, "epoch": 4.555882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9477919340133667, "kl": 0.009812175645492971, "learning_rate": 9.510714091940566e-07, "loss": 9.91038978099823e-05, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 469.8125, "completions/min_length": 439.0, "epoch": 4.557352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.013479831628501415, "kl": 0.006704961066134274, "learning_rate": 9.510160266511171e-07, "loss": 6.690664304187521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 478.625, "completions/min_length": 343.0, "epoch": 4.5588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8036598563194275, "kl": 0.004814131127204746, "learning_rate": 9.509606143963831e-07, "loss": 4.806648939847946e-05, "reward": 0.8921874761581421, "reward_std": 0.20032759010791779, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 414.4375, "completions/min_length": 383.0, "epoch": 4.560294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.018140286207199097, "kl": 0.007854917668737471, "learning_rate": 9.50905172433505e-07, "loss": 7.872057904023677e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 535.1875, "completions/min_length": 452.0, "epoch": 4.561764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.7128101587295532, "kl": 0.010819024289958179, "learning_rate": 9.508497007661353e-07, "loss": 0.00010938569903373718, "reward": 0.3812500238418579, "reward_std": 0.4462881088256836, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 409.6875, "completions/min_length": 343.0, "epoch": 4.563235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9809228777885437, "kl": 0.006252582301385701, "learning_rate": 9.507941993979282e-07, "loss": 6.182988727232441e-05, "reward": 0.7749999761581421, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 409.0, "completions/min_length": 349.0, "epoch": 4.564705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011065523140132427, "kl": 0.005924721248447895, "learning_rate": 9.507386683325402e-07, "loss": 5.922053605900146e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 419.75, "completions/min_length": 374.0, "epoch": 4.5661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.023717598989605904, "kl": 0.005889866384677589, "learning_rate": 9.506831075736294e-07, "loss": 5.9446254454087466e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 475.0, "completions/min_length": 447.0, "epoch": 4.567647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.047642532736063004, "kl": 0.008581315982155502, "learning_rate": 9.506275171248559e-07, "loss": 8.602793968748301e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 430.4375, "completions/min_length": 382.0, "epoch": 4.569117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.02144629880785942, "kl": 0.0076246054959483445, "learning_rate": 9.505718969898822e-07, "loss": 7.595008355565369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 389.875, "completions/min_length": 350.0, "epoch": 4.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.3085283041000366, "kl": 0.006069350347388536, "learning_rate": 9.505162471723718e-07, "loss": 6.0185790061950684e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 467.9375, "completions/min_length": 401.0, "epoch": 4.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.056608881801366806, "kl": 0.008979117381386459, "learning_rate": 9.504605676759915e-07, "loss": 8.965016604634002e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 485.0625, "completions/min_length": 445.0, "epoch": 4.573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009107880294322968, "kl": 0.005638732691295445, "learning_rate": 9.504048585044088e-07, "loss": 5.6250930356327444e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 419.5, "completions/min_length": 359.0, "epoch": 4.575, "frac_reward_zero_std": 1.0, "grad_norm": 0.01558745838701725, "kl": 0.007096189598087221, "learning_rate": 9.503491196612938e-07, "loss": 7.09773667040281e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.125, "completions/min_length": 401.0, "epoch": 4.576470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016681058332324028, "kl": 0.005686535965651274, "learning_rate": 9.502933511503186e-07, "loss": 5.715073348255828e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/mean_length": 548.6875, "completions/min_length": 452.0, "epoch": 4.577941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.1764390468597412, "kl": 0.005209362541791052, "learning_rate": 9.502375529751569e-07, "loss": 5.2347779273986816e-05, "reward": 0.5987499952316284, "reward_std": 0.3711315989494324, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 432.125, "completions/min_length": 385.0, "epoch": 4.579411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.027974801138043404, "kl": 0.005839944118633866, "learning_rate": 9.501817251394848e-07, "loss": 5.819090438308194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 435.875, "completions/min_length": 396.0, "epoch": 4.580882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.7658040523529053, "kl": 0.005517430487088859, "learning_rate": 9.501258676469799e-07, "loss": 5.535408854484558e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 491.4375, "completions/min_length": 435.0, "epoch": 4.58235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015140721574425697, "kl": 0.007371348678134382, "learning_rate": 9.500699805013217e-07, "loss": 7.34011919121258e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 437.4375, "completions/min_length": 360.0, "epoch": 4.583823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02074911631643772, "kl": 0.0060577631229534745, "learning_rate": 9.500140637061924e-07, "loss": 6.069816299714148e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 482.375, "completions/min_length": 412.0, "epoch": 4.5852941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.9868994951248169, "kl": 0.006456092000007629, "learning_rate": 9.499581172652753e-07, "loss": 6.405504245776683e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 489.5625, "completions/min_length": 415.0, "epoch": 4.586764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4471617937088013, "kl": 0.008439913159236312, "learning_rate": 9.499021411822563e-07, "loss": 8.405745029449463e-05, "reward": 0.42929166555404663, "reward_std": 0.3714209198951721, "rewards/DrugCombAccuracyCOTORM/mean": 0.3399999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.463101863861084, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5729166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6468406319618225, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 431.125, "completions/min_length": 377.0, "epoch": 4.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00992367323487997, "kl": 0.005983006791211665, "learning_rate": 9.498461354608227e-07, "loss": 5.9675268857972696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/mean_length": 481.125, "completions/min_length": 345.0, "epoch": 4.589705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9827175140380859, "kl": 0.00554765434935689, "learning_rate": 9.49790100104664e-07, "loss": 5.4389201977755874e-05, "reward": 0.9775833487510681, "reward_std": 0.0634039044380188, "rewards/DrugCombAccuracyCOTORM/mean": 0.9771875143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.09125000238418579, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 444.0625, "completions/min_length": 366.0, "epoch": 4.591176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9649860262870789, "kl": 0.005421450943686068, "learning_rate": 9.497340351174718e-07, "loss": 5.4305215599015355e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 399.9375, "completions/min_length": 360.0, "epoch": 4.5926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0180743969976902, "kl": 0.005576566676609218, "learning_rate": 9.496779405029397e-07, "loss": 5.641106690745801e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 465.0625, "completions/min_length": 421.0, "epoch": 4.594117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4748539924621582, "kl": 0.007780911982990801, "learning_rate": 9.496218162647628e-07, "loss": 7.744133472442627e-05, "reward": 0.8464166522026062, "reward_std": 0.32451075315475464, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 446.9375, "completions/min_length": 401.0, "epoch": 4.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.027549684047698975, "kl": 0.0084807324456051, "learning_rate": 9.495656624066384e-07, "loss": 8.406285633100197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 437.8125, "completions/min_length": 365.0, "epoch": 4.597058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.010133819654583931, "kl": 0.005048030172474682, "learning_rate": 9.49509478932266e-07, "loss": 5.084574513603002e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 436.1875, "completions/min_length": 391.0, "epoch": 4.598529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.007595469243824482, "kl": 0.005132935184519738, "learning_rate": 9.494532658453466e-07, "loss": 5.111066275276244e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 409.5625, "completions/min_length": 367.0, "epoch": 4.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.015478122979402542, "kl": 0.006891260854899883, "learning_rate": 9.493970231495834e-07, "loss": 6.938635488040745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 441.1875, "completions/min_length": 397.0, "epoch": 4.601470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03324881196022034, "kl": 0.005215798155404627, "learning_rate": 9.493407508486817e-07, "loss": 5.189021976548247e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 524.125, "completions/min_length": 453.0, "epoch": 4.602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9394621849060059, "kl": 0.007303006597794592, "learning_rate": 9.492844489463485e-07, "loss": 7.357047434197739e-05, "reward": 0.6564375162124634, "reward_std": 0.01760922558605671, "rewards/DrugCombAccuracyCOTORM/mean": 0.596495509147644, "rewards/DrugCombAccuracyCOTORM/std": 0.4173080325126648, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7924107313156128, "rewards/DrugCombCoverageCOTORM/std": 0.22435423731803894, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 451.0, "completions/min_length": 401.0, "epoch": 4.604411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0432276725769043, "kl": 0.007096450892277062, "learning_rate": 9.492281174462929e-07, "loss": 7.137676584534347e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 437.5625, "completions/min_length": 304.0, "epoch": 4.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01823817566037178, "kl": 0.005822794046252966, "learning_rate": 9.491717563522255e-07, "loss": 5.798901111120358e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 450.6875, "completions/min_length": 396.0, "epoch": 4.607352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015529593452811241, "kl": 0.0073782955296337605, "learning_rate": 9.491153656678596e-07, "loss": 7.334395195357502e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 455.8125, "completions/min_length": 397.0, "epoch": 4.608823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.013567391782999039, "kl": 0.006374728516675532, "learning_rate": 9.4905894539691e-07, "loss": 6.311549805104733e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 492.4375, "completions/min_length": 405.0, "epoch": 4.610294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9400851726531982, "kl": 0.0071181937819346786, "learning_rate": 9.490024955430935e-07, "loss": 7.085396646289155e-05, "reward": 0.8589166402816772, "reward_std": 0.19595398008823395, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 415.625, "completions/min_length": 358.0, "epoch": 4.6117647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.011526207439601421, "kl": 0.006210105726495385, "learning_rate": 9.48946016110129e-07, "loss": 6.169762491481379e-05, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 487.9375, "completions/min_length": 392.0, "epoch": 4.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008368713781237602, "kl": 0.005380903487093747, "learning_rate": 9.48889507101737e-07, "loss": 5.3422641940414906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 459.75, "completions/min_length": 412.0, "epoch": 4.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.19963307678699493, "kl": 0.009493162157014012, "learning_rate": 9.488329685216403e-07, "loss": 9.638574556447566e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 463.875, "completions/min_length": 408.0, "epoch": 4.616176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.013211374171078205, "kl": 0.005578575248364359, "learning_rate": 9.487764003735634e-07, "loss": 5.607716229860671e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 443.8125, "completions/min_length": 393.0, "epoch": 4.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.017645614221692085, "kl": 0.006940511171706021, "learning_rate": 9.48719802661233e-07, "loss": 6.9400281063281e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 438.75, "completions/min_length": 377.0, "epoch": 4.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.81138676404953, "kl": 0.0066442572278901935, "learning_rate": 9.486631753883776e-07, "loss": 6.660589133389294e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 481.9375, "completions/min_length": 403.0, "epoch": 4.620588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.914307713508606, "kl": 0.00882779888343066, "learning_rate": 9.486065185587276e-07, "loss": 8.752469875616953e-05, "reward": 0.887499988079071, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 481.1875, "completions/min_length": 402.0, "epoch": 4.622058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5345662832260132, "kl": 0.006704570143483579, "learning_rate": 9.485498321760155e-07, "loss": 6.648898124694824e-05, "reward": 0.5458333492279053, "reward_std": 0.42174696922302246, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 425.6875, "completions/min_length": 369.0, "epoch": 4.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8707817196846008, "kl": 0.008044306538067758, "learning_rate": 9.484931162439756e-07, "loss": 8.001178503036499e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 445.625, "completions/min_length": 389.0, "epoch": 4.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.027187252417206764, "kl": 0.007557630422525108, "learning_rate": 9.484363707663441e-07, "loss": 7.555309275630862e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.4375, "completions/min_length": 421.0, "epoch": 4.626470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.008474212139844894, "kl": 0.00455251126550138, "learning_rate": 9.483795957468594e-07, "loss": 4.552960308501497e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 436.1875, "completions/min_length": 363.0, "epoch": 4.627941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1503417491912842, "kl": 0.01352351251989603, "learning_rate": 9.483227911892616e-07, "loss": 0.00013468367978930473, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 455.25, "completions/min_length": 394.0, "epoch": 4.629411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010780276730656624, "kl": 0.006362871383316815, "learning_rate": 9.482659570972929e-07, "loss": 6.355212099151686e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 415.4375, "completions/min_length": 326.0, "epoch": 4.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9360109567642212, "kl": 0.006272929720580578, "learning_rate": 9.482090934746973e-07, "loss": 6.249449506867677e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 446.8125, "completions/min_length": 394.0, "epoch": 4.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077108596451580524, "kl": 0.005318364244885743, "learning_rate": 9.481522003252212e-07, "loss": 5.3077092161402106e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 419.0625, "completions/min_length": 361.0, "epoch": 4.633823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010513140819966793, "kl": 0.005777257494628429, "learning_rate": 9.480952776526119e-07, "loss": 5.7802069932222366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 429.1875, "completions/min_length": 381.0, "epoch": 4.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.20661097764968872, "kl": 0.012822049553506076, "learning_rate": 9.480383254606199e-07, "loss": 0.00012832946958951652, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 461.75, "completions/min_length": 370.0, "epoch": 4.636764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.804056704044342, "kl": 0.0064077062997967005, "learning_rate": 9.479813437529969e-07, "loss": 6.423622835427523e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 405.75, "completions/min_length": 363.0, "epoch": 4.6382352941176475, "frac_reward_zero_std": 0.0, "grad_norm": 1.6824709177017212, "kl": 0.007199375424534082, "learning_rate": 9.479243325334964e-07, "loss": 7.204711437225342e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 543.375, "completions/min_length": 476.0, "epoch": 4.639705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.1295474767684937, "kl": 0.0059715971583500504, "learning_rate": 9.478672918058747e-07, "loss": 6.014108657836914e-05, "reward": 0.7392857074737549, "reward_std": 0.2701093256473541, "rewards/DrugCombAccuracyCOTORM/mean": 0.7053571939468384, "rewards/DrugCombAccuracyCOTORM/std": 0.4252850115299225, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 511.5, "completions/min_length": 440.0, "epoch": 4.641176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 132.8424530029297, "kl": 1.3530955773312598, "learning_rate": 9.478102215738891e-07, "loss": 0.014225076884031296, "reward": 0.3787708282470703, "reward_std": 0.3717536926269531, "rewards/DrugCombAccuracyCOTORM/mean": 0.2775000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.43701261281967163, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.7044829726219177, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 495.5, "completions/min_length": 374.0, "epoch": 4.642647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9925033450126648, "kl": 0.006517384550534189, "learning_rate": 9.477531218412994e-07, "loss": 6.536021828651428e-05, "reward": 0.9166666865348816, "reward_std": 0.0690065324306488, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 439.625, "completions/min_length": 352.0, "epoch": 4.644117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.007282308302819729, "kl": 0.004831754835322499, "learning_rate": 9.476959926118672e-07, "loss": 4.832989361602813e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 476.6875, "completions/min_length": 410.0, "epoch": 4.645588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.26764976978302, "kl": 0.00787571631371975, "learning_rate": 9.47638833889356e-07, "loss": 7.884949445724487e-05, "reward": 0.7602500319480896, "reward_std": 0.37320438027381897, "rewards/DrugCombAccuracyCOTORM/mean": 0.7081249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4495474696159363, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 439.0, "completions/min_length": 394.0, "epoch": 4.647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0701751708984375, "kl": 0.0069174631498754025, "learning_rate": 9.475816456775312e-07, "loss": 6.80381417623721e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 454.0, "completions/min_length": 413.0, "epoch": 4.648529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2284977436065674, "kl": 0.010622267378494143, "learning_rate": 9.475244279801602e-07, "loss": 0.00010598450899124146, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 462.4375, "completions/min_length": 402.0, "epoch": 4.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.9268366694450378, "kl": 0.007574848714284599, "learning_rate": 9.474671808010125e-07, "loss": 7.576495409011841e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 431.8125, "completions/min_length": 341.0, "epoch": 4.651470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2987984418869019, "kl": 0.006407545646652579, "learning_rate": 9.474099041438593e-07, "loss": 6.397813558578491e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 507.375, "completions/min_length": 425.0, "epoch": 4.652941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.4157999753952026, "kl": 0.008508714847266674, "learning_rate": 9.47352598012474e-07, "loss": 8.578598499298096e-05, "reward": 0.40954166650772095, "reward_std": 0.12310808151960373, "rewards/DrugCombAccuracyCOTORM/mean": 0.289642870426178, "rewards/DrugCombAccuracyCOTORM/std": 0.18355782330036163, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7782738208770752, "rewards/DrugCombCoverageCOTORM/std": 0.22381268441677094, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 421.6875, "completions/min_length": 388.0, "epoch": 4.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02347516268491745, "kl": 0.011133361142128706, "learning_rate": 9.472952624106314e-07, "loss": 0.00011138913396280259, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 445.125, "completions/min_length": 404.0, "epoch": 4.655882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.009668962098658085, "kl": 0.005199869861826301, "learning_rate": 9.472378973421091e-07, "loss": 5.177784987608902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/mean_length": 546.9375, "completions/min_length": 459.0, "epoch": 4.6573529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.3806804418563843, "kl": 0.006918343249708414, "learning_rate": 9.471805028106858e-07, "loss": 6.998702883720398e-05, "reward": 0.11100701987743378, "reward_std": 0.16328567266464233, "rewards/DrugCombAccuracyCOTORM/mean": 0.060633763670921326, "rewards/DrugCombAccuracyCOTORM/std": 0.16577264666557312, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.375, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 449.0, "completions/min_length": 381.0, "epoch": 4.658823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9999737739562988, "kl": 0.007919338531792164, "learning_rate": 9.471230788201428e-07, "loss": 7.958230708027259e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 508.4375, "completions/min_length": 460.0, "epoch": 4.660294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591765999794006, "kl": 0.006654846656601876, "learning_rate": 9.470656253742628e-07, "loss": 6.656348705291748e-05, "reward": 0.9677083492279053, "reward_std": 0.06179160997271538, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 478.0, "completions/min_length": 403.0, "epoch": 4.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0630098581314087, "kl": 0.006704967934638262, "learning_rate": 9.470081424768308e-07, "loss": 6.633251905441284e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 456.625, "completions/min_length": 395.0, "epoch": 4.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00900785718113184, "kl": 0.005572901456616819, "learning_rate": 9.469506301316334e-07, "loss": 5.5795062507968396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 493.6875, "completions/min_length": 424.0, "epoch": 4.6647058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.6505026817321777, "kl": 0.006202509975992143, "learning_rate": 9.4689308834246e-07, "loss": 6.330013275146484e-05, "reward": 0.359749972820282, "reward_std": 0.31124213337898254, "rewards/DrugCombAccuracyCOTORM/mean": 0.27000001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.3762977719306946, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.4166666865348816, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 439.3125, "completions/min_length": 382.0, "epoch": 4.666176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9710910320281982, "kl": 0.007227896247059107, "learning_rate": 9.468355171131006e-07, "loss": 7.253885269165039e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 429.0625, "completions/min_length": 398.0, "epoch": 4.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020137790590524673, "kl": 0.006669726630207151, "learning_rate": 9.467779164473482e-07, "loss": 6.684706750093028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 516.4375, "completions/min_length": 429.0, "epoch": 4.669117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.852975606918335, "kl": 0.008045558352023363, "learning_rate": 9.467202863489972e-07, "loss": 8.042904664762318e-05, "reward": 0.9302083253860474, "reward_std": 0.09717614203691483, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18257419764995575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 444.0625, "completions/min_length": 396.0, "epoch": 4.670588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1769808530807495, "kl": 0.007119092275388539, "learning_rate": 9.466626268218445e-07, "loss": 7.127970457077026e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 414.8125, "completions/min_length": 345.0, "epoch": 4.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9628814458847046, "kl": 0.008312752703204751, "learning_rate": 9.466049378696881e-07, "loss": 8.157212141668424e-05, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 497.8125, "completions/min_length": 424.0, "epoch": 4.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9583343267440796, "kl": 0.017979084397666156, "learning_rate": 9.465472194963286e-07, "loss": 0.0001772907271515578, "reward": 0.7401250004768372, "reward_std": 0.16039831936359406, "rewards/DrugCombAccuracyCOTORM/mean": 0.6868749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.41749998927116394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 478.125, "completions/min_length": 425.0, "epoch": 4.675, "frac_reward_zero_std": 0.5, "grad_norm": 0.9838724136352539, "kl": 0.008018436841666698, "learning_rate": 9.464894717055685e-07, "loss": 8.0060213804245e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 473.5625, "completions/min_length": 349.0, "epoch": 4.676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9280577301979065, "kl": 0.006328164483420551, "learning_rate": 9.464316945012119e-07, "loss": 6.350461626425385e-05, "reward": 0.5687500238418579, "reward_std": 0.03720119222998619, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 430.1875, "completions/min_length": 329.0, "epoch": 4.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.017956534400582314, "kl": 0.007430948666296899, "learning_rate": 9.463738878870649e-07, "loss": 7.535985787399113e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 493.5, "completions/min_length": 396.0, "epoch": 4.679411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.46542227268219, "kl": 0.009218574035912752, "learning_rate": 9.463160518669359e-07, "loss": 9.249895811080933e-05, "reward": 0.125, "reward_std": 0.1849764883518219, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.25, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 486.0, "completions/min_length": 387.0, "epoch": 4.680882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9787301421165466, "kl": 0.004941839724779129, "learning_rate": 9.462581864446348e-07, "loss": 4.9728783778846264e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 490.75, "completions/min_length": 423.0, "epoch": 4.682352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.019024327397346497, "kl": 0.006244562449865043, "learning_rate": 9.462002916239738e-07, "loss": 6.245380063774064e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 483.8125, "completions/min_length": 433.0, "epoch": 4.6838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.7404001355171204, "kl": 0.0063262422336265445, "learning_rate": 9.461423674087669e-07, "loss": 6.342298001982272e-05, "reward": 0.581250011920929, "reward_std": 0.17100021243095398, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 468.1875, "completions/min_length": 412.0, "epoch": 4.685294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4836273193359375, "kl": 0.008245311095379293, "learning_rate": 9.460844138028296e-07, "loss": 8.226931095123291e-05, "reward": 0.8552083373069763, "reward_std": 0.268846333026886, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 537.8125, "completions/min_length": 471.0, "epoch": 4.686764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.1978081464767456, "kl": 0.0061369737377390265, "learning_rate": 9.460264308099803e-07, "loss": 6.0871243476867676e-05, "reward": 0.6734083294868469, "reward_std": 0.32787710428237915, "rewards/DrugCombAccuracyCOTORM/mean": 0.6191041469573975, "rewards/DrugCombAccuracyCOTORM/std": 0.44969314336776733, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.2963903248310089, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 503.375, "completions/min_length": 432.0, "epoch": 4.688235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9704116582870483, "kl": 0.007529188529588282, "learning_rate": 9.459684184340384e-07, "loss": 7.505714893341064e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 444.6875, "completions/min_length": 381.0, "epoch": 4.689705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015155882574617863, "kl": 0.0054669888922944665, "learning_rate": 9.459103766788257e-07, "loss": 5.439308733912185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 471.75, "completions/min_length": 379.0, "epoch": 4.6911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.7761965394020081, "kl": 0.007215007324703038, "learning_rate": 9.458523055481657e-07, "loss": 7.294421084225178e-05, "reward": 0.8296874761581421, "reward_std": 0.23508521914482117, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 414.1875, "completions/min_length": 357.0, "epoch": 4.692647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9894468784332275, "kl": 0.005054602224845439, "learning_rate": 9.457942050458843e-07, "loss": 5.060434341430664e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.875, "completions/min_length": 389.0, "epoch": 4.694117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9868329763412476, "kl": 0.00632149213925004, "learning_rate": 9.457360751758088e-07, "loss": 6.377696990966797e-05, "reward": 0.7875000238418579, "reward_std": 0.22951814532279968, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 491.5625, "completions/min_length": 446.0, "epoch": 4.695588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9134700298309326, "kl": 0.007370593608357012, "learning_rate": 9.456779159417685e-07, "loss": 7.382221519947052e-05, "reward": 0.8870925307273865, "reward_std": 0.0720701515674591, "rewards/DrugCombAccuracyCOTORM/mean": 0.8666781783103943, "rewards/DrugCombAccuracyCOTORM/std": 0.18051154911518097, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 516.0, "completions/min_length": 452.0, "epoch": 4.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0350605249404907, "kl": 0.006385218584910035, "learning_rate": 9.456197273475952e-07, "loss": 6.414872041204944e-05, "reward": 0.6456925272941589, "reward_std": 0.05196129530668259, "rewards/DrugCombAccuracyCOTORM/mean": 0.5766469240188599, "rewards/DrugCombAccuracyCOTORM/std": 0.44576725363731384, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.18225695192813873, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 428.8125, "completions/min_length": 376.0, "epoch": 4.698529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01378575898706913, "kl": 0.005088289151899517, "learning_rate": 9.455615093971219e-07, "loss": 5.0360788009129465e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 528.6875, "completions/min_length": 464.0, "epoch": 4.7, "frac_reward_zero_std": 0.5, "grad_norm": 0.9149236679077148, "kl": 0.005820356891490519, "learning_rate": 9.455032620941839e-07, "loss": 5.805492401123047e-05, "reward": 0.8064583539962769, "reward_std": 0.19289778172969818, "rewards/DrugCombAccuracyCOTORM/mean": 0.7958333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.33205869793891907, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6979166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.6672740578651428, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 416.3125, "completions/min_length": 366.0, "epoch": 4.701470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8361173272132874, "kl": 0.007383808144368231, "learning_rate": 9.454449854426186e-07, "loss": 7.34180212020874e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 454.3125, "completions/min_length": 430.0, "epoch": 4.702941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9147619009017944, "kl": 0.006479802425019443, "learning_rate": 9.453866794462646e-07, "loss": 6.431825750041753e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 434.125, "completions/min_length": 386.0, "epoch": 4.704411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02916291542351246, "kl": 0.0075667607598006725, "learning_rate": 9.453283441089636e-07, "loss": 7.605056453030556e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 424.6875, "completions/min_length": 328.0, "epoch": 4.705882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.018635360524058342, "kl": 0.00650681700790301, "learning_rate": 9.452699794345581e-07, "loss": 6.506044155685231e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 434.0625, "completions/min_length": 390.0, "epoch": 4.70735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012400762178003788, "kl": 0.007693406078033149, "learning_rate": 9.452115854268932e-07, "loss": 7.692062354180962e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 446.0625, "completions/min_length": 375.0, "epoch": 4.708823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.00831427238881588, "kl": 0.004785605706274509, "learning_rate": 9.451531620898159e-07, "loss": 4.819571040570736e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 437.25, "completions/min_length": 403.0, "epoch": 4.7102941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.023448744788765907, "kl": 0.006296572624705732, "learning_rate": 9.450947094271746e-07, "loss": 6.283920083660632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 451.1875, "completions/min_length": 387.0, "epoch": 4.711764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012061979621648788, "kl": 0.006993997609242797, "learning_rate": 9.450362274428205e-07, "loss": 7.04045087331906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 482.9375, "completions/min_length": 363.0, "epoch": 4.713235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8659268617630005, "kl": 0.0072728905361145735, "learning_rate": 9.449777161406059e-07, "loss": 7.296353578567505e-05, "reward": 0.7577222585678101, "reward_std": 0.11736807972192764, "rewards/DrugCombAccuracyCOTORM/mean": 0.7318750023841858, "rewards/DrugCombAccuracyCOTORM/std": 0.3388061225414276, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7222222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.31426969170570374, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 438.4375, "completions/min_length": 378.0, "epoch": 4.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011764278635382652, "kl": 0.005275889183394611, "learning_rate": 9.449191755243852e-07, "loss": 5.288971442496404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 463.3125, "completions/min_length": 371.0, "epoch": 4.716176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9412538409233093, "kl": 0.00752977526281029, "learning_rate": 9.448606055980155e-07, "loss": 7.501845539081842e-05, "reward": 0.8067083358764648, "reward_std": 0.022174540907144547, "rewards/DrugCombAccuracyCOTORM/mean": 0.768151044845581, "rewards/DrugCombAccuracyCOTORM/std": 0.2439415454864502, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.1280868947505951, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 433.875, "completions/min_length": 395.0, "epoch": 4.7176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.143510341644287, "kl": 0.007720446097664535, "learning_rate": 9.448020063653548e-07, "loss": 7.697725959587842e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 423.75, "completions/min_length": 350.0, "epoch": 4.719117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019114665687084198, "kl": 0.009567035245709121, "learning_rate": 9.447433778302634e-07, "loss": 9.26120555959642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 487.625, "completions/min_length": 388.0, "epoch": 4.720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019809041172266006, "kl": 0.006741196848452091, "learning_rate": 9.44684719996604e-07, "loss": 6.778042734367773e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 486.3125, "completions/min_length": 406.0, "epoch": 4.722058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.584505558013916, "kl": 0.009322967263869941, "learning_rate": 9.446260328682405e-07, "loss": 9.244680404663086e-05, "reward": 0.375, "reward_std": 0.403614342212677, "rewards/DrugCombAccuracyCOTORM/mean": 0.28125, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 469.0625, "completions/min_length": 427.0, "epoch": 4.723529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011875642463564873, "kl": 0.0052982905763201416, "learning_rate": 9.445673164490393e-07, "loss": 5.316318856785074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 519.625, "completions/min_length": 418.0, "epoch": 4.725, "frac_reward_zero_std": 0.0, "grad_norm": 1.3260260820388794, "kl": 0.007783964974805713, "learning_rate": 9.445085707428682e-07, "loss": 7.843971252441406e-05, "reward": 0.7745833396911621, "reward_std": 0.34554338455200195, "rewards/DrugCombAccuracyCOTORM/mean": 0.762499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.36492007970809937, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5769491791725159, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 434.625, "completions/min_length": 384.0, "epoch": 4.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008272051811218262, "kl": 0.005422710906714201, "learning_rate": 9.444497957535974e-07, "loss": 5.4479089158121496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 463.75, "completions/min_length": 376.0, "epoch": 4.727941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011232113465666771, "kl": 0.006373112206347287, "learning_rate": 9.443909914850988e-07, "loss": 6.370996561599895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 478.125, "completions/min_length": 437.0, "epoch": 4.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018427135422825813, "kl": 0.010181745281443, "learning_rate": 9.443321579412463e-07, "loss": 0.00010142850806005299, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 455.1875, "completions/min_length": 421.0, "epoch": 4.730882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.014020020142197609, "kl": 0.007586640538647771, "learning_rate": 9.442732951259158e-07, "loss": 7.588946755276993e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 493.1875, "completions/min_length": 440.0, "epoch": 4.732352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6651023626327515, "kl": 0.007169871241785586, "learning_rate": 9.442144030429849e-07, "loss": 7.187575101852417e-05, "reward": 0.7279666662216187, "reward_std": 0.2968941330909729, "rewards/DrugCombAccuracyCOTORM/mean": 0.6703749895095825, "rewards/DrugCombAccuracyCOTORM/std": 0.40814507007598877, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 475.6875, "completions/min_length": 428.0, "epoch": 4.733823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01685032807290554, "kl": 0.009275504387915134, "learning_rate": 9.441554816963332e-07, "loss": 9.326692088507116e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 442.0625, "completions/min_length": 403.0, "epoch": 4.735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9675278663635254, "kl": 0.00950729160103947, "learning_rate": 9.440965310898424e-07, "loss": 9.529254020890221e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 485.125, "completions/min_length": 405.0, "epoch": 4.7367647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.8355920314788818, "kl": 0.007937342626973987, "learning_rate": 9.44037551227396e-07, "loss": 7.783013279549778e-05, "reward": 0.9114583730697632, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 462.4375, "completions/min_length": 380.0, "epoch": 4.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0542088747024536, "kl": 0.00773995544295758, "learning_rate": 9.439785421128795e-07, "loss": 7.716889376752079e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 497.0625, "completions/min_length": 361.0, "epoch": 4.739705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9351543188095093, "kl": 0.007940475479699671, "learning_rate": 9.439195037501801e-07, "loss": 7.940363138914108e-05, "reward": 0.907469630241394, "reward_std": 0.0836985856294632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8856391310691833, "rewards/DrugCombAccuracyCOTORM/std": 0.1853298395872116, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.02846374548971653, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 409.6875, "completions/min_length": 320.0, "epoch": 4.741176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.115280270576477, "kl": 0.007423259899951518, "learning_rate": 9.438604361431875e-07, "loss": 7.390975952148438e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 381.8125, "completions/min_length": 337.0, "epoch": 4.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1658573150634766, "kl": 0.00580002466449514, "learning_rate": 9.438013392957923e-07, "loss": 5.744502414017916e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 439.4375, "completions/min_length": 397.0, "epoch": 4.7441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014261680655181408, "kl": 0.006602339679375291, "learning_rate": 9.437422132118881e-07, "loss": 6.61137601127848e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 444.8125, "completions/min_length": 378.0, "epoch": 4.745588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01677548699080944, "kl": 0.0068697750102728605, "learning_rate": 9.4368305789537e-07, "loss": 6.941620813449845e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 449.1875, "completions/min_length": 374.0, "epoch": 4.747058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1671233177185059, "kl": 0.007092580432072282, "learning_rate": 9.436238733501347e-07, "loss": 7.057653419906273e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 460.9375, "completions/min_length": 403.0, "epoch": 4.748529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.009995303116738796, "kl": 0.005539822275750339, "learning_rate": 9.435646595800815e-07, "loss": 5.562769729294814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 465.25, "completions/min_length": 402.0, "epoch": 4.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.9439179301261902, "kl": 0.009939766372554004, "learning_rate": 9.435054165891108e-07, "loss": 9.924719051923603e-05, "reward": 0.7352083325386047, "reward_std": 0.08815589547157288, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.35352590680122375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.21273136138916016, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 427.9375, "completions/min_length": 363.0, "epoch": 4.751470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012602453120052814, "kl": 0.005596755538135767, "learning_rate": 9.434461443811257e-07, "loss": 5.535278614843264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 410.125, "completions/min_length": 369.0, "epoch": 4.752941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.06723227351903915, "kl": 0.008654075558297336, "learning_rate": 9.433868429600309e-07, "loss": 8.568474731873721e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 470.9375, "completions/min_length": 419.0, "epoch": 4.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9718402624130249, "kl": 0.006278068874962628, "learning_rate": 9.43327512329733e-07, "loss": 6.285309791564941e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 456.875, "completions/min_length": 402.0, "epoch": 4.7558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.028516171500086784, "kl": 0.0066884871339425445, "learning_rate": 9.432681524941404e-07, "loss": 6.536273576784879e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 449.375, "completions/min_length": 385.0, "epoch": 4.757352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0014007091522217, "kl": 0.006472119130194187, "learning_rate": 9.432087634571637e-07, "loss": 6.42240047454834e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 445.125, "completions/min_length": 387.0, "epoch": 4.758823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1941646337509155, "kl": 0.008700738311745226, "learning_rate": 9.431493452227153e-07, "loss": 8.709379471838474e-05, "reward": 0.737500011920929, "reward_std": 0.2199837565422058, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 394.9375, "completions/min_length": 311.0, "epoch": 4.760294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.018505018204450607, "kl": 0.007029845262877643, "learning_rate": 9.430898977947095e-07, "loss": 7.16372742317617e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 445.8125, "completions/min_length": 379.0, "epoch": 4.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012219410389661789, "kl": 0.007726228213869035, "learning_rate": 9.430304211770625e-07, "loss": 7.721639121882617e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/mean_length": 520.25, "completions/min_length": 386.0, "epoch": 4.7632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.7956664562225342, "kl": 0.007667417638003826, "learning_rate": 9.429709153736927e-07, "loss": 7.611791079398245e-05, "reward": 0.9035309553146362, "reward_std": 0.15553687512874603, "rewards/DrugCombAccuracyCOTORM/mean": 0.887374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.2730875313282013, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9363095164299011, "rewards/DrugCombCoverageCOTORM/std": 0.16295526921749115, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 493.125, "completions/min_length": 419.0, "epoch": 4.764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.046104073524475, "kl": 0.00772908132057637, "learning_rate": 9.429113803885198e-07, "loss": 7.768630894133821e-05, "reward": 0.65625, "reward_std": 0.21223722398281097, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.875, "rewards/DrugCombCOTFormatORM/std": 0.22360680997371674, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 483.8125, "completions/min_length": 398.0, "epoch": 4.766176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.899236798286438, "kl": 0.006557086366228759, "learning_rate": 9.428518162254662e-07, "loss": 6.590827251784503e-05, "reward": 0.6854166984558105, "reward_std": 0.19526894390583038, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 478.5625, "completions/min_length": 413.0, "epoch": 4.767647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 2.0756258964538574, "kl": 0.010125968605279922, "learning_rate": 9.427922228884557e-07, "loss": 0.0001025274395942688, "reward": 0.3944583535194397, "reward_std": 0.17997314035892487, "rewards/DrugCombAccuracyCOTORM/mean": 0.3016666769981384, "rewards/DrugCombAccuracyCOTORM/std": 0.2996998727321625, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.5313112735748291, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 559.3125, "completions/min_length": 493.0, "epoch": 4.769117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.1979303359985352, "kl": 0.007113370578736067, "learning_rate": 9.42732600381414e-07, "loss": 7.107853889465332e-05, "reward": 0.5968062877655029, "reward_std": 0.34577977657318115, "rewards/DrugCombAccuracyCOTORM/mean": 0.5361120700836182, "rewards/DrugCombAccuracyCOTORM/std": 0.39865654706954956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6791666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.6715239882469177, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 427.8125, "completions/min_length": 395.0, "epoch": 4.770588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.015811514109373093, "kl": 0.006038003950379789, "learning_rate": 9.42672948708269e-07, "loss": 6.043786197551526e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 470.3125, "completions/min_length": 414.0, "epoch": 4.772058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8452519178390503, "kl": 0.005789946764707565, "learning_rate": 9.426132678729504e-07, "loss": 5.820133083034307e-05, "reward": 0.7585208415985107, "reward_std": 0.15322764217853546, "rewards/DrugCombAccuracyCOTORM/mean": 0.7189843654632568, "rewards/DrugCombAccuracyCOTORM/std": 0.3925682604312897, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.27216553688049316, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 450.875, "completions/min_length": 349.0, "epoch": 4.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010035831481218338, "kl": 0.005755960941314697, "learning_rate": 9.425535578793899e-07, "loss": 5.7411114539718255e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 469.3125, "completions/min_length": 402.0, "epoch": 4.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.026098767295479774, "kl": 0.00783392135053873, "learning_rate": 9.424938187315209e-07, "loss": 7.816431752871722e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 464.6875, "completions/min_length": 398.0, "epoch": 4.776470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.4182825088500977, "kl": 0.012688154471106827, "learning_rate": 9.42434050433279e-07, "loss": 0.00012639947817660868, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 443.5625, "completions/min_length": 389.0, "epoch": 4.777941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.013165797106921673, "kl": 0.00688932603225112, "learning_rate": 9.423742529886014e-07, "loss": 6.877521809656173e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 438.625, "completions/min_length": 379.0, "epoch": 4.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.221637487411499, "kl": 0.007580453879199922, "learning_rate": 9.423144264014276e-07, "loss": 7.683382136747241e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 533.1875, "completions/min_length": 446.0, "epoch": 4.780882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0171939134597778, "kl": 0.008302429341711104, "learning_rate": 9.422545706756988e-07, "loss": 8.277889719465747e-05, "reward": 0.8676249980926514, "reward_std": 0.09157034754753113, "rewards/DrugCombAccuracyCOTORM/mean": 0.8384374976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.21864522993564606, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 461.875, "completions/min_length": 412.0, "epoch": 4.7823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8935535550117493, "kl": 0.007944423938170075, "learning_rate": 9.42194685815358e-07, "loss": 7.991726306499913e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 432.0625, "completions/min_length": 375.0, "epoch": 4.783823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7935727834701538, "kl": 0.008127909968607128, "learning_rate": 9.421347718243504e-07, "loss": 8.21053545223549e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 480.3125, "completions/min_length": 424.0, "epoch": 4.785294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.432052731513977, "kl": 0.007124423980712891, "learning_rate": 9.42074828706623e-07, "loss": 7.133558392524719e-05, "reward": 0.5767499804496765, "reward_std": 0.35526520013809204, "rewards/DrugCombAccuracyCOTORM/mean": 0.47874999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.48152363300323486, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 464.5, "completions/min_length": 365.0, "epoch": 4.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01424937229603529, "kl": 0.010190603439696133, "learning_rate": 9.420148564661246e-07, "loss": 0.00010236861271550879, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 431.8125, "completions/min_length": 382.0, "epoch": 4.788235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015608461573719978, "kl": 0.0059292708756402135, "learning_rate": 9.41954855106806e-07, "loss": 5.911898915655911e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 494.375, "completions/min_length": 415.0, "epoch": 4.7897058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.6673136949539185, "kl": 0.006869807722978294, "learning_rate": 9.418948246326199e-07, "loss": 6.820261478424072e-05, "reward": 0.30533331632614136, "reward_std": 0.1919218748807907, "rewards/DrugCombAccuracyCOTORM/mean": 0.1525000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.2857387959957123, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 415.25, "completions/min_length": 371.0, "epoch": 4.791176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.05348397418856621, "kl": 0.008520354051142931, "learning_rate": 9.418347650475212e-07, "loss": 8.559056732337922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 481.4375, "completions/min_length": 439.0, "epoch": 4.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9572413563728333, "kl": 0.007120138383470476, "learning_rate": 9.417746763554663e-07, "loss": 7.092207670211792e-05, "reward": 0.910812497138977, "reward_std": 0.16675344109535217, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 448.375, "completions/min_length": 404.0, "epoch": 4.794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.009900875389575958, "kl": 0.0057982755824923515, "learning_rate": 9.417145585604138e-07, "loss": 5.837494973093271e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 443.8125, "completions/min_length": 374.0, "epoch": 4.795588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008017471060156822, "kl": 0.005726678064092994, "learning_rate": 9.416544116663239e-07, "loss": 5.723742287955247e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 437.9375, "completions/min_length": 367.0, "epoch": 4.797058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485524233430624, "kl": 0.007599484408274293, "learning_rate": 9.415942356771591e-07, "loss": 7.53020285628736e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 502.1875, "completions/min_length": 472.0, "epoch": 4.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014227569103240967, "kl": 0.007888837833888829, "learning_rate": 9.415340305968836e-07, "loss": 7.892373832874e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 456.5, "completions/min_length": 409.0, "epoch": 4.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.1578179597854614, "kl": 0.008385596796870232, "learning_rate": 9.414737964294634e-07, "loss": 8.387863636016846e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 433.75, "completions/min_length": 376.0, "epoch": 4.801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0482085943222046, "kl": 0.006168742896988988, "learning_rate": 9.414135331788669e-07, "loss": 6.12088042544201e-05, "reward": 0.6383333206176758, "reward_std": 0.045544322580099106, "rewards/DrugCombAccuracyCOTORM/mean": 0.5687500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.45213383436203003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 488.3125, "completions/min_length": 449.0, "epoch": 4.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.3499985933303833, "kl": 0.008875779225490987, "learning_rate": 9.413532408490639e-07, "loss": 8.86860434547998e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 439.8125, "completions/min_length": 390.0, "epoch": 4.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012152845039963722, "kl": 0.006683178362436593, "learning_rate": 9.412929194440262e-07, "loss": 6.63273167447187e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 458.6875, "completions/min_length": 397.0, "epoch": 4.805882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0311909914016724, "kl": 0.00798983033746481, "learning_rate": 9.412325689677278e-07, "loss": 8.007397991605103e-05, "reward": 0.8229166269302368, "reward_std": 0.24574756622314453, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 461.1875, "completions/min_length": 426.0, "epoch": 4.807352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6669285297393799, "kl": 0.009311360074207187, "learning_rate": 9.411721894241444e-07, "loss": 9.37730073928833e-05, "reward": 0.8500000238418579, "reward_std": 0.2314550131559372, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 528.9375, "completions/min_length": 465.0, "epoch": 4.8088235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 1.3450015783309937, "kl": 0.006347095826640725, "learning_rate": 9.411117808172535e-07, "loss": 6.332993507385254e-05, "reward": 0.6567299962043762, "reward_std": 0.24938081204891205, "rewards/DrugCombAccuracyCOTORM/mean": 0.584975004196167, "rewards/DrugCombAccuracyCOTORM/std": 0.434963583946228, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8875000476837158, "rewards/DrugCombCoverageCOTORM/std": 0.12583057582378387, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 424.9375, "completions/min_length": 378.0, "epoch": 4.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.022677553817629814, "kl": 0.0068892245180904865, "learning_rate": 9.41051343151035e-07, "loss": 6.853399827377871e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 417.875, "completions/min_length": 359.0, "epoch": 4.811764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007339754607528448, "kl": 0.00542562804184854, "learning_rate": 9.409908764294701e-07, "loss": 5.4245996579993516e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 487.4375, "completions/min_length": 424.0, "epoch": 4.813235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.512513279914856, "kl": 0.010029203025624156, "learning_rate": 9.409303806565423e-07, "loss": 0.00010044127702713013, "reward": 0.40082597732543945, "reward_std": 0.08047844469547272, "rewards/DrugCombAccuracyCOTORM/mean": 0.37681370973587036, "rewards/DrugCombAccuracyCOTORM/std": 0.39506861567497253, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.00625002384185791, "rewards/DrugCombCoverageCOTORM/std": 0.9102884531021118, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 446.8125, "completions/min_length": 406.0, "epoch": 4.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0659526586532593, "kl": 0.006611217511817813, "learning_rate": 9.408698558362366e-07, "loss": 6.55805051792413e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 435.3125, "completions/min_length": 413.0, "epoch": 4.8161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01734168268740177, "kl": 0.006123744766227901, "learning_rate": 9.408093019725409e-07, "loss": 6.125136860646307e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 431.5, "completions/min_length": 371.0, "epoch": 4.817647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.009781341068446636, "kl": 0.0068378448486328125, "learning_rate": 9.407487190694437e-07, "loss": 6.813638174207881e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 441.25, "completions/min_length": 395.0, "epoch": 4.819117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9331456422805786, "kl": 0.006389393354766071, "learning_rate": 9.406881071309364e-07, "loss": 6.362795829772949e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/mean_length": 579.9375, "completions/min_length": 438.0, "epoch": 4.820588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.1200993061065674, "kl": 0.006945088622160256, "learning_rate": 9.406274661610119e-07, "loss": 6.995722651481628e-05, "reward": 0.6411111354827881, "reward_std": 0.2741648554801941, "rewards/DrugCombAccuracyCOTORM/mean": 0.5583333373069763, "rewards/DrugCombAccuracyCOTORM/std": 0.33188575506210327, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9444444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.09938079863786697, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 457.75, "completions/min_length": 389.0, "epoch": 4.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01241912879049778, "kl": 0.007077526301145554, "learning_rate": 9.405667961636647e-07, "loss": 7.016905874479562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 494.625, "completions/min_length": 435.0, "epoch": 4.823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.220910668373108, "kl": 0.009690933860838413, "learning_rate": 9.405060971428922e-07, "loss": 9.751319885253906e-05, "reward": 0.21435417234897614, "reward_std": 0.031053775921463966, "rewards/DrugCombAccuracyCOTORM/mean": 0.06416666507720947, "rewards/DrugCombAccuracyCOTORM/std": 0.07514429837465286, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.37453675270080566, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 437.6875, "completions/min_length": 386.0, "epoch": 4.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.009722771123051643, "kl": 0.00675880431663245, "learning_rate": 9.404453691026928e-07, "loss": 6.822415889473632e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 467.1875, "completions/min_length": 377.0, "epoch": 4.826470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5550962686538696, "kl": 0.008350052521564066, "learning_rate": 9.40384612047067e-07, "loss": 8.368492126464844e-05, "reward": 0.8416666984558105, "reward_std": 0.2346440553665161, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.27216553688049316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 418.3125, "completions/min_length": 335.0, "epoch": 4.827941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8904531598091125, "kl": 0.007925695041194558, "learning_rate": 9.403238259800175e-07, "loss": 7.953020394779742e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 423.1875, "completions/min_length": 317.0, "epoch": 4.829411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.6556360721588135, "kl": 0.006106103886850178, "learning_rate": 9.402630109055485e-07, "loss": 6.148219108581543e-05, "reward": 0.7992666959762573, "reward_std": 0.3585401177406311, "rewards/DrugCombAccuracyCOTORM/mean": 0.7720000147819519, "rewards/DrugCombAccuracyCOTORM/std": 0.40872809290885925, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8166666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.33774852752685547, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 464.375, "completions/min_length": 435.0, "epoch": 4.830882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.04674144089221954, "kl": 0.0072735571302473545, "learning_rate": 9.402021668276667e-07, "loss": 7.099382492015138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 420.5, "completions/min_length": 325.0, "epoch": 4.83235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007429639343172312, "kl": 0.005072985077276826, "learning_rate": 9.401412937503801e-07, "loss": 5.054222492617555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 432.0625, "completions/min_length": 382.0, "epoch": 4.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.008887712843716145, "kl": 0.006099250284023583, "learning_rate": 9.400803916776989e-07, "loss": 6.084107735659927e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 502.1875, "completions/min_length": 434.0, "epoch": 4.8352941176470585, "frac_reward_zero_std": 0.0, "grad_norm": 1.3755605220794678, "kl": 0.008852340281009674, "learning_rate": 9.400194606136352e-07, "loss": 8.746981620788574e-05, "reward": 0.45925000309944153, "reward_std": 0.393064022064209, "rewards/DrugCombAccuracyCOTORM/mean": 0.3832142949104309, "rewards/DrugCombAccuracyCOTORM/std": 0.436936616897583, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5267857313156128, "rewards/DrugCombCoverageCOTORM/std": 0.4098075032234192, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 422.4375, "completions/min_length": 364.0, "epoch": 4.836764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1669732332229614, "kl": 0.008855449734255672, "learning_rate": 9.39958500562203e-07, "loss": 8.871099271345884e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 483.625, "completions/min_length": 401.0, "epoch": 4.838235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2070441246032715, "kl": 0.006697417818941176, "learning_rate": 9.398975115274183e-07, "loss": 6.652623414993286e-05, "reward": 0.5573541522026062, "reward_std": 0.33002951741218567, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7790664434432983, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 423.25, "completions/min_length": 364.0, "epoch": 4.839705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03370809182524681, "kl": 0.009845071821473539, "learning_rate": 9.398364935132986e-07, "loss": 9.837142715696245e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 398.75, "completions/min_length": 369.0, "epoch": 4.841176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.547120213508606, "kl": 0.00844297162257135, "learning_rate": 9.397754465238639e-07, "loss": 8.367002010345459e-05, "reward": 0.824999988079071, "reward_std": 0.37287637591362, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 425.0625, "completions/min_length": 394.0, "epoch": 4.8426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012349041178822517, "kl": 0.006937231519259512, "learning_rate": 9.397143705631357e-07, "loss": 6.976366421440616e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 460.3125, "completions/min_length": 417.0, "epoch": 4.844117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.174986481666565, "kl": 0.007548900204710662, "learning_rate": 9.396532656351374e-07, "loss": 7.617331721121445e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 438.8125, "completions/min_length": 392.0, "epoch": 4.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.131027102470398, "kl": 0.0063145398162305355, "learning_rate": 9.395921317438948e-07, "loss": 6.306893192231655e-05, "reward": 0.8964166641235352, "reward_std": 0.19718945026397705, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 436.1875, "completions/min_length": 394.0, "epoch": 4.847058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.008896978572010994, "kl": 0.005788304028101265, "learning_rate": 9.39530968893435e-07, "loss": 5.78389190195594e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 420.375, "completions/min_length": 380.0, "epoch": 4.848529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03669050708413124, "kl": 0.009704157477244735, "learning_rate": 9.394697770877871e-07, "loss": 9.734174818731844e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 449.8125, "completions/min_length": 418.0, "epoch": 4.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.010468235239386559, "kl": 0.00693384453188628, "learning_rate": 9.394085563309826e-07, "loss": 6.941512401681393e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 464.0, "completions/min_length": 391.0, "epoch": 4.851470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.045541584491729736, "kl": 0.01126385957468301, "learning_rate": 9.393473066270545e-07, "loss": 0.00011198240099474788, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 484.375, "completions/min_length": 452.0, "epoch": 4.852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014272543601691723, "kl": 0.007888863212428987, "learning_rate": 9.392860279800375e-07, "loss": 7.890082633821294e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 436.1875, "completions/min_length": 400.0, "epoch": 4.854411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1852511167526245, "kl": 0.009243232663720846, "learning_rate": 9.392247203939687e-07, "loss": 9.281340317102149e-05, "reward": 0.4312500059604645, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 440.5625, "completions/min_length": 387.0, "epoch": 4.855882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0287522077560425, "kl": 0.006383950123563409, "learning_rate": 9.391633838728869e-07, "loss": 6.392219074768946e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 446.3125, "completions/min_length": 373.0, "epoch": 4.857352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.021284205839037895, "kl": 0.009212526259943843, "learning_rate": 9.391020184208328e-07, "loss": 9.254710312234238e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 444.5, "completions/min_length": 386.0, "epoch": 4.858823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.94551020860672, "kl": 0.007674812921322882, "learning_rate": 9.390406240418489e-07, "loss": 7.628487219335511e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 489.75, "completions/min_length": 431.0, "epoch": 4.860294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.466209053993225, "kl": 0.007731477962806821, "learning_rate": 9.389792007399798e-07, "loss": 7.721781730651855e-05, "reward": 0.7585833072662354, "reward_std": 0.22490990161895752, "rewards/DrugCombAccuracyCOTORM/mean": 0.721666693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.3167754113674164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 469.4375, "completions/min_length": 427.0, "epoch": 4.8617647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.9680900573730469, "kl": 0.006244459073059261, "learning_rate": 9.389177485192719e-07, "loss": 6.254017353057861e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 488.8125, "completions/min_length": 427.0, "epoch": 4.863235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011370723135769367, "kl": 0.0056352384854108095, "learning_rate": 9.388562673837735e-07, "loss": 5.6215118092950433e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 471.0625, "completions/min_length": 418.0, "epoch": 4.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01792612113058567, "kl": 0.007940144278109074, "learning_rate": 9.387947573375346e-07, "loss": 7.962203380884603e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 444.75, "completions/min_length": 372.0, "epoch": 4.866176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9159151315689087, "kl": 0.006511515355668962, "learning_rate": 9.387332183846077e-07, "loss": 6.41550068394281e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 409.0625, "completions/min_length": 387.0, "epoch": 4.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.009873910807073116, "kl": 0.0059726034523919225, "learning_rate": 9.386716505290466e-07, "loss": 5.949490878265351e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 453.625, "completions/min_length": 385.0, "epoch": 4.8691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008780540898442268, "kl": 0.005835423246026039, "learning_rate": 9.386100537749074e-07, "loss": 5.8519777667243034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 457.875, "completions/min_length": 412.0, "epoch": 4.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8721215128898621, "kl": 0.00780607876367867, "learning_rate": 9.385484281262479e-07, "loss": 7.743353489786386e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 443.9375, "completions/min_length": 334.0, "epoch": 4.872058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7699684500694275, "kl": 0.005300916614942253, "learning_rate": 9.384867735871275e-07, "loss": 5.27501106262207e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 455.125, "completions/min_length": 378.0, "epoch": 4.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9479492902755737, "kl": 0.007590242777951062, "learning_rate": 9.384250901616084e-07, "loss": 7.546693086624146e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 533.1875, "completions/min_length": 481.0, "epoch": 4.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.8443490266799927, "kl": 0.0067211511777713895, "learning_rate": 9.383633778537539e-07, "loss": 6.754467904102057e-05, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 483.8125, "completions/min_length": 412.0, "epoch": 4.876470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010929469019174576, "kl": 0.006552163045853376, "learning_rate": 9.383016366676293e-07, "loss": 6.5424814238213e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 537.375, "completions/min_length": 477.0, "epoch": 4.877941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3971203565597534, "kl": 0.007990711135789752, "learning_rate": 9.38239866607302e-07, "loss": 8.106976747512817e-05, "reward": 0.5754166841506958, "reward_std": 0.2647639811038971, "rewards/DrugCombAccuracyCOTORM/mean": 0.518750011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4888336658477783, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5335937142372131, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 475.5625, "completions/min_length": 439.0, "epoch": 4.879411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009938955307006836, "kl": 0.006522642681375146, "learning_rate": 9.381780676768414e-07, "loss": 6.531726830871776e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 494.25, "completions/min_length": 413.0, "epoch": 4.8808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0827430486679077, "kl": 0.007539321552030742, "learning_rate": 9.381162398803186e-07, "loss": 7.589161396026611e-05, "reward": 0.8025000095367432, "reward_std": 0.18670706450939178, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3265986442565918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 498.8125, "completions/min_length": 415.0, "epoch": 4.882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6400405168533325, "kl": 0.008037377148866653, "learning_rate": 9.380543832218068e-07, "loss": 8.12336802482605e-05, "reward": 0.6804649233818054, "reward_std": 0.3528653383255005, "rewards/DrugCombAccuracyCOTORM/mean": 0.6099561452865601, "rewards/DrugCombAccuracyCOTORM/std": 0.4915139973163605, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.25166115164756775, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 446.5, "completions/min_length": 369.0, "epoch": 4.883823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010893171653151512, "kl": 0.006793041015043855, "learning_rate": 9.379924977053806e-07, "loss": 6.746578583260998e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 431.9375, "completions/min_length": 334.0, "epoch": 4.885294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.04735530540347099, "kl": 0.007318735588341951, "learning_rate": 9.379305833351172e-07, "loss": 7.321395969484001e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 455.1875, "completions/min_length": 377.0, "epoch": 4.886764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8321332931518555, "kl": 0.010264870710670948, "learning_rate": 9.378686401150952e-07, "loss": 0.00010303035378456116, "reward": 0.8374999761581421, "reward_std": 0.35143834352493286, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 456.5625, "completions/min_length": 378.0, "epoch": 4.8882352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.2889716625213623, "kl": 0.00841682543978095, "learning_rate": 9.378066680493952e-07, "loss": 8.408352732658386e-05, "reward": 0.797208309173584, "reward_std": 0.13202115893363953, "rewards/DrugCombAccuracyCOTORM/mean": 0.7660416960716248, "rewards/DrugCombAccuracyCOTORM/std": 0.3253538906574249, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.22334785759449005, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 478.4375, "completions/min_length": 379.0, "epoch": 4.889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012732349336147308, "kl": 0.006576295476406813, "learning_rate": 9.377446671421e-07, "loss": 6.584723450941965e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 458.4375, "completions/min_length": 353.0, "epoch": 4.891176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1197561025619507, "kl": 0.01270012988243252, "learning_rate": 9.376826373972939e-07, "loss": 0.00012512714602053165, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 443.5, "completions/min_length": 399.0, "epoch": 4.892647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.014248423278331757, "kl": 0.006500908872112632, "learning_rate": 9.376205788190632e-07, "loss": 6.52759918011725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 518.4375, "completions/min_length": 398.0, "epoch": 4.894117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9852743744850159, "kl": 0.007545912521891296, "learning_rate": 9.375584914114962e-07, "loss": 7.580206147395074e-05, "reward": 0.8889417052268982, "reward_std": 0.12721318006515503, "rewards/DrugCombAccuracyCOTORM/mean": 0.8650833368301392, "rewards/DrugCombAccuracyCOTORM/std": 0.259701132774353, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 533.125, "completions/min_length": 432.0, "epoch": 4.895588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.235283374786377, "kl": 0.007061040145345032, "learning_rate": 9.374963751786833e-07, "loss": 7.0914626121521e-05, "reward": 0.7655726671218872, "reward_std": 0.3014417588710785, "rewards/DrugCombAccuracyCOTORM/mean": 0.721288800239563, "rewards/DrugCombAccuracyCOTORM/std": 0.37275785207748413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24973134696483612, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 472.875, "completions/min_length": 421.0, "epoch": 4.897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1263645887374878, "kl": 0.007103480165824294, "learning_rate": 9.374342301247163e-07, "loss": 7.09017040207982e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 436.3125, "completions/min_length": 368.0, "epoch": 4.898529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013299725018441677, "kl": 0.006748325191438198, "learning_rate": 9.373720562536891e-07, "loss": 6.695746560581028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 452.625, "completions/min_length": 405.0, "epoch": 4.9, "frac_reward_zero_std": 0.5, "grad_norm": 2.867373466491699, "kl": 0.009209046605974436, "learning_rate": 9.373098535696978e-07, "loss": 9.215153113473207e-05, "reward": 0.7964166402816772, "reward_std": 0.22247067093849182, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/mean_length": 564.0, "completions/min_length": 385.0, "epoch": 4.901470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.207167387008667, "kl": 0.006920757936313748, "learning_rate": 9.3724762207684e-07, "loss": 6.890296936035156e-05, "reward": 0.41892364621162415, "reward_std": 0.22654810547828674, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7048611044883728, "rewards/DrugCombCoverageCOTORM/std": 0.3831420838832855, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 483.9375, "completions/min_length": 389.0, "epoch": 4.902941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02023952640593052, "kl": 0.006595214013941586, "learning_rate": 9.371853617792156e-07, "loss": 6.638386548729613e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 477.5625, "completions/min_length": 431.0, "epoch": 4.904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9369334578514099, "kl": 0.006324547226540744, "learning_rate": 9.371230726809256e-07, "loss": 6.374344229698181e-05, "reward": 0.8040624856948853, "reward_std": 0.16225165128707886, "rewards/DrugCombAccuracyCOTORM/mean": 0.7648437023162842, "rewards/DrugCombAccuracyCOTORM/std": 0.3602319657802582, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 451.75, "completions/min_length": 379.0, "epoch": 4.905882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8008168339729309, "kl": 0.0072686042403802276, "learning_rate": 9.37060754786074e-07, "loss": 7.282379374373704e-05, "reward": 0.9479166269302368, "reward_std": 0.1473138928413391, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 467.375, "completions/min_length": 424.0, "epoch": 4.9073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2472947835922241, "kl": 0.007032264838926494, "learning_rate": 9.369984080987659e-07, "loss": 7.057934999465942e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 443.875, "completions/min_length": 376.0, "epoch": 4.908823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1541653871536255, "kl": 0.012545710545964539, "learning_rate": 9.369360326231086e-07, "loss": 0.00012809807958547026, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 446.6875, "completions/min_length": 359.0, "epoch": 4.910294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8843435645103455, "kl": 0.007310073706321418, "learning_rate": 9.368736283632111e-07, "loss": 7.31050968170166e-05, "reward": 0.5375000238418579, "reward_std": 0.051754921674728394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 549.625, "completions/min_length": 489.0, "epoch": 4.911764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8101216554641724, "kl": 0.011064768652431667, "learning_rate": 9.368111953231847e-07, "loss": 0.00011006742715835571, "reward": 0.8577708005905151, "reward_std": 0.24380245804786682, "rewards/DrugCombAccuracyCOTORM/mean": 0.8345833420753479, "rewards/DrugCombAccuracyCOTORM/std": 0.30906519293785095, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9010416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25634315609931946, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 475.875, "completions/min_length": 373.0, "epoch": 4.913235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9990532398223877, "kl": 0.009914168971590698, "learning_rate": 9.367487335071423e-07, "loss": 9.941309690475464e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 468.25, "completions/min_length": 402.0, "epoch": 4.9147058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.5124949216842651, "kl": 0.006754091358743608, "learning_rate": 9.366862429191983e-07, "loss": 6.712807225994766e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 425.6875, "completions/min_length": 374.0, "epoch": 4.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.027440331876277924, "kl": 0.00963381165638566, "learning_rate": 9.366237235634699e-07, "loss": 9.588900866219774e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 488.5625, "completions/min_length": 427.0, "epoch": 4.91764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9964513778686523, "kl": 0.007867156295105815, "learning_rate": 9.365611754440754e-07, "loss": 7.87787139415741e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 461.0, "completions/min_length": 403.0, "epoch": 4.919117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.527185082435608, "kl": 0.007562514510937035, "learning_rate": 9.364985985651357e-07, "loss": 7.560849189758301e-05, "reward": 0.6571249961853027, "reward_std": 0.4143086075782776, "rewards/DrugCombAccuracyCOTORM/mean": 0.5831249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.49084240198135376, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 485.5, "completions/min_length": 379.0, "epoch": 4.920588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1507724523544312, "kl": 0.006685872212983668, "learning_rate": 9.364359929307728e-07, "loss": 6.628179107792675e-05, "reward": 0.8999999761581421, "reward_std": 0.10690448433160782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 444.625, "completions/min_length": 407.0, "epoch": 4.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01894035004079342, "kl": 0.00585593213327229, "learning_rate": 9.363733585451112e-07, "loss": 5.831650923937559e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 426.875, "completions/min_length": 368.0, "epoch": 4.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1791942119598389, "kl": 0.009109447011724114, "learning_rate": 9.363106954122771e-07, "loss": 9.214674355462193e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 421.875, "completions/min_length": 381.0, "epoch": 4.925, "frac_reward_zero_std": 1.0, "grad_norm": 0.017062878236174583, "kl": 0.008552041254006326, "learning_rate": 9.362480035363985e-07, "loss": 8.537273242836818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 459.3125, "completions/min_length": 399.0, "epoch": 4.926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.839716911315918, "kl": 0.007742831250652671, "learning_rate": 9.361852829216055e-07, "loss": 7.734702376183122e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 447.5625, "completions/min_length": 408.0, "epoch": 4.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010242068208754063, "kl": 0.006163964280858636, "learning_rate": 9.361225335720299e-07, "loss": 6.15168537478894e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 461.875, "completions/min_length": 397.0, "epoch": 4.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.7918683886528015, "kl": 0.006274472922086716, "learning_rate": 9.360597554918054e-07, "loss": 6.268918514251709e-05, "reward": 0.644058346748352, "reward_std": 0.008650270290672779, "rewards/DrugCombAccuracyCOTORM/mean": 0.5850208401679993, "rewards/DrugCombAccuracyCOTORM/std": 0.4286993443965912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 526.6875, "completions/min_length": 437.0, "epoch": 4.930882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9242140054702759, "kl": 0.00729698536451906, "learning_rate": 9.359969486850678e-07, "loss": 7.333293615374714e-05, "reward": 0.8110312223434448, "reward_std": 0.12740108370780945, "rewards/DrugCombAccuracyCOTORM/mean": 0.7706249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.32014212012290955, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.06404344737529755, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 500.5625, "completions/min_length": 393.0, "epoch": 4.932352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.148699402809143, "kl": 0.00942802568897605, "learning_rate": 9.359341131559546e-07, "loss": 9.401150600751862e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 478.875, "completions/min_length": 390.0, "epoch": 4.9338235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.079445242881775, "kl": 0.008404085878282785, "learning_rate": 9.358712489086052e-07, "loss": 8.526696910848841e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 445.0625, "completions/min_length": 378.0, "epoch": 4.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.029342366382479668, "kl": 0.006773806177079678, "learning_rate": 9.358083559471611e-07, "loss": 6.779540854040533e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 415.0625, "completions/min_length": 370.0, "epoch": 4.936764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1781269311904907, "kl": 0.008746978943236172, "learning_rate": 9.357454342757651e-07, "loss": 8.66428017616272e-05, "reward": 0.6603333353996277, "reward_std": 0.03111269697546959, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 456.625, "completions/min_length": 375.0, "epoch": 4.938235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3328256607055664, "kl": 0.007062641088850796, "learning_rate": 9.356824838985628e-07, "loss": 7.00540840625763e-05, "reward": 0.6609375476837158, "reward_std": 0.46798235177993774, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 445.375, "completions/min_length": 375.0, "epoch": 4.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.031708668917417526, "kl": 0.011392653454095125, "learning_rate": 9.356195048197011e-07, "loss": 0.00011291407281532884, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.0625, "completions/min_length": 382.0, "epoch": 4.9411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.0810527801513672, "kl": 0.008590975077822804, "learning_rate": 9.355564970433287e-07, "loss": 8.575381070841104e-05, "reward": 0.921625018119812, "reward_std": 0.14512228965759277, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 496.8125, "completions/min_length": 446.0, "epoch": 4.942647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2184607982635498, "kl": 0.007555734948255122, "learning_rate": 9.354934605735967e-07, "loss": 7.510557043133304e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 441.375, "completions/min_length": 379.0, "epoch": 4.944117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8271496295928955, "kl": 0.0069004223914816976, "learning_rate": 9.354303954146575e-07, "loss": 6.905943155288696e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 454.5625, "completions/min_length": 425.0, "epoch": 4.945588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009284285828471184, "kl": 0.005675185588188469, "learning_rate": 9.353673015706658e-07, "loss": 5.6941626098705456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 411.875, "completions/min_length": 344.0, "epoch": 4.947058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.418347954750061, "kl": 0.00793365971185267, "learning_rate": 9.353041790457781e-07, "loss": 7.880851626396179e-05, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 425.75, "completions/min_length": 375.0, "epoch": 4.948529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01369976531714201, "kl": 0.007243711617775261, "learning_rate": 9.352410278441527e-07, "loss": 7.308633212232962e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 429.875, "completions/min_length": 392.0, "epoch": 4.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.01567120850086212, "kl": 0.006055609439499676, "learning_rate": 9.351778479699498e-07, "loss": 6.0261056205490604e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 426.5625, "completions/min_length": 369.0, "epoch": 4.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00862937979400158, "kl": 0.004715597373433411, "learning_rate": 9.351146394273316e-07, "loss": 4.7117326175794005e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 428.375, "completions/min_length": 361.0, "epoch": 4.952941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0289431810379028, "kl": 0.006985335377976298, "learning_rate": 9.350514022204621e-07, "loss": 7.049739360809326e-05, "reward": 0.6918749809265137, "reward_std": 0.0581006184220314, "rewards/DrugCombAccuracyCOTORM/mean": 0.6343749761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.38918665051460266, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.1663190722465515, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 507.0625, "completions/min_length": 436.0, "epoch": 4.954411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0315442085266113, "kl": 0.006628206581808627, "learning_rate": 9.349881363535072e-07, "loss": 6.633251905441284e-05, "reward": 0.625, "reward_std": 0.15811389684677124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 421.375, "completions/min_length": 356.0, "epoch": 4.955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.7706767916679382, "kl": 0.006040470791049302, "learning_rate": 9.349248418306345e-07, "loss": 6.052851676940918e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 462.5, "completions/min_length": 393.0, "epoch": 4.95735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403505325317383, "kl": 0.005920036928728223, "learning_rate": 9.348615186560141e-07, "loss": 5.926936864852905e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 475.1875, "completions/min_length": 372.0, "epoch": 4.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01901250146329403, "kl": 0.007860800717025995, "learning_rate": 9.347981668338174e-07, "loss": 7.772984827170148e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 463.875, "completions/min_length": 347.0, "epoch": 4.9602941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.9453984498977661, "kl": 0.007780028390698135, "learning_rate": 9.347347863682177e-07, "loss": 7.828697562217712e-05, "reward": 0.7873333692550659, "reward_std": 0.16114985942840576, "rewards/DrugCombAccuracyCOTORM/mean": 0.73416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.38874441385269165, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 439.25, "completions/min_length": 366.0, "epoch": 4.961764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017598241567611694, "kl": 0.00841854652389884, "learning_rate": 9.346713772633905e-07, "loss": 8.452613838016987e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 456.8125, "completions/min_length": 395.0, "epoch": 4.963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014931432902812958, "kl": 0.006890865392051637, "learning_rate": 9.346079395235129e-07, "loss": 6.915345875313506e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 496.75, "completions/min_length": 455.0, "epoch": 4.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0440760850906372, "kl": 0.007071726256981492, "learning_rate": 9.345444731527641e-07, "loss": 7.058680057525635e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 420.9375, "completions/min_length": 364.0, "epoch": 4.966176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.02046424150466919, "kl": 0.006562508642673492, "learning_rate": 9.344809781553251e-07, "loss": 6.565813964698464e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 539.5, "completions/min_length": 443.0, "epoch": 4.9676470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.901587963104248, "kl": 0.008330208016559482, "learning_rate": 9.344174545353789e-07, "loss": 8.316710591316223e-05, "reward": 0.6583437919616699, "reward_std": 0.40245723724365234, "rewards/DrugCombAccuracyCOTORM/mean": 0.5989062190055847, "rewards/DrugCombAccuracyCOTORM/std": 0.4542942941188812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.792187511920929, "rewards/DrugCombCoverageCOTORM/std": 0.40121883153915405, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 414.5625, "completions/min_length": 368.0, "epoch": 4.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1602482795715332, "kl": 0.008187537547200918, "learning_rate": 9.343539022971101e-07, "loss": 8.156007970683277e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 467.75, "completions/min_length": 437.0, "epoch": 4.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019387220963835716, "kl": 0.008889199700206518, "learning_rate": 9.342903214447055e-07, "loss": 8.899203385226429e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 455.6875, "completions/min_length": 367.0, "epoch": 4.972058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02960115484893322, "kl": 0.009089115192182362, "learning_rate": 9.342267119823535e-07, "loss": 9.10743692656979e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 427.6875, "completions/min_length": 341.0, "epoch": 4.973529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9644376635551453, "kl": 0.005682700080797076, "learning_rate": 9.341630739142446e-07, "loss": 5.729246186092496e-05, "reward": 0.574999988079071, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 453.8125, "completions/min_length": 375.0, "epoch": 4.975, "frac_reward_zero_std": 0.5, "grad_norm": 0.9695428013801575, "kl": 0.006354016251862049, "learning_rate": 9.340994072445712e-07, "loss": 6.366521120071411e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 467.0625, "completions/min_length": 415.0, "epoch": 4.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9620150923728943, "kl": 0.008899880689568818, "learning_rate": 9.340357119775273e-07, "loss": 8.815526962280273e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 440.5, "completions/min_length": 415.0, "epoch": 4.977941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1693735122680664, "kl": 0.007429170305840671, "learning_rate": 9.339719881173092e-07, "loss": 7.42646079743281e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 467.3125, "completions/min_length": 399.0, "epoch": 4.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.055715799331665, "kl": 0.008526700781658292, "learning_rate": 9.339082356681147e-07, "loss": 8.57491759234108e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 389.375, "completions/min_length": 301.0, "epoch": 4.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0525394678115845, "kl": 0.00778754020575434, "learning_rate": 9.338444546341438e-07, "loss": 7.718060805927962e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 479.4375, "completions/min_length": 407.0, "epoch": 4.982352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7921406626701355, "kl": 0.005047831800766289, "learning_rate": 9.33780645019598e-07, "loss": 5.05298376083374e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 475.5, "completions/min_length": 414.0, "epoch": 4.983823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.459857702255249, "kl": 0.010604348964989185, "learning_rate": 9.337168068286812e-07, "loss": 0.00010579824447631836, "reward": 0.7294583320617676, "reward_std": 0.25519436597824097, "rewards/DrugCombAccuracyCOTORM/mean": 0.6631250381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.349889874458313, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 477.5, "completions/min_length": 394.0, "epoch": 4.985294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.6351327896118164, "kl": 0.009233052609488368, "learning_rate": 9.336529400655987e-07, "loss": 9.2335045337677e-05, "reward": 0.8500000238418579, "reward_std": 0.2777460217475891, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.35939764976501465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 438.8125, "completions/min_length": 402.0, "epoch": 4.9867647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.013733496889472008, "kl": 0.006546182674355805, "learning_rate": 9.33589044734558e-07, "loss": 6.57671334920451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 446.0, "completions/min_length": 366.0, "epoch": 4.988235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01627822406589985, "kl": 0.006615293910726905, "learning_rate": 9.335251208397683e-07, "loss": 6.628410483244807e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 511.125, "completions/min_length": 446.0, "epoch": 4.989705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3607873916625977, "kl": 0.008041680790483952, "learning_rate": 9.334611683854408e-07, "loss": 8.001178503036499e-05, "reward": 0.3773333430290222, "reward_std": 0.31948405504226685, "rewards/DrugCombAccuracyCOTORM/mean": 0.2841666638851166, "rewards/DrugCombAccuracyCOTORM/std": 0.39547720551490784, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.38005849719047546, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 424.0, "completions/min_length": 377.0, "epoch": 4.991176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.02947392500936985, "kl": 0.009191204560920596, "learning_rate": 9.333971873757883e-07, "loss": 9.188390686176717e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 474.0625, "completions/min_length": 355.0, "epoch": 4.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0161597728729248, "kl": 0.007949650986120105, "learning_rate": 9.33333177815026e-07, "loss": 7.865131919970736e-05, "reward": 0.6894761919975281, "reward_std": 0.1616155207157135, "rewards/DrugCombAccuracyCOTORM/mean": 0.6235640048980713, "rewards/DrugCombAccuracyCOTORM/std": 0.4741412103176117, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.18972444534301758, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 434.75, "completions/min_length": 381.0, "epoch": 4.9941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.007032332476228476, "kl": 0.005026980419643223, "learning_rate": 9.332691397073707e-07, "loss": 5.00771748193074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 400.6875, "completions/min_length": 358.0, "epoch": 4.995588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.011560826562345028, "kl": 0.007030696142464876, "learning_rate": 9.332050730570408e-07, "loss": 7.001031190156937e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 547.25, "completions/min_length": 407.0, "epoch": 4.997058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.710776925086975, "kl": 0.0072987357852980494, "learning_rate": 9.331409778682569e-07, "loss": 7.343292236328125e-05, "reward": 0.5166637301445007, "reward_std": 0.18131139874458313, "rewards/DrugCombAccuracyCOTORM/mean": 0.4303348660469055, "rewards/DrugCombAccuracyCOTORM/std": 0.415378212928772, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7239583730697632, "rewards/DrugCombCoverageCOTORM/std": 0.2630285918712616, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 443.3125, "completions/min_length": 356.0, "epoch": 4.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008762774989008904, "kl": 0.005231339251622558, "learning_rate": 9.330768541452417e-07, "loss": 5.321407297742553e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 431.625, "completions/min_length": 344.0, "epoch": 5.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06848397850990295, "kl": 0.008241056522820145, "learning_rate": 9.330127018922193e-07, "loss": 8.34117890917696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 478.4375, "completions/min_length": 424.0, "epoch": 5.001470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0838799476623535, "kl": 0.011886214255355299, "learning_rate": 9.329485211134158e-07, "loss": 0.00011783275112975389, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 465.5625, "completions/min_length": 370.0, "epoch": 5.002941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8994112014770508, "kl": 0.0062704544980078936, "learning_rate": 9.328843118130596e-07, "loss": 6.274133920669556e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 453.5, "completions/min_length": 339.0, "epoch": 5.004411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9751622080802917, "kl": 0.008388091577216983, "learning_rate": 9.328200739953802e-07, "loss": 8.366857946384698e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 417.5, "completions/min_length": 377.0, "epoch": 5.0058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010825542733073235, "kl": 0.005958392866887152, "learning_rate": 9.327558076646098e-07, "loss": 5.9140147641301155e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 407.4375, "completions/min_length": 319.0, "epoch": 5.007352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0609650611877441, "kl": 0.00866764783859253, "learning_rate": 9.326915128249819e-07, "loss": 8.79594444995746e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 423.5, "completions/min_length": 373.0, "epoch": 5.008823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011765358969569206, "kl": 0.006395457778126001, "learning_rate": 9.326271894807323e-07, "loss": 6.433229282265529e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 458.625, "completions/min_length": 427.0, "epoch": 5.010294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014238355681300163, "kl": 0.008479736163280904, "learning_rate": 9.325628376360981e-07, "loss": 8.466470171697438e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 501.125, "completions/min_length": 440.0, "epoch": 5.011764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2743529081344604, "kl": 0.007947079720906913, "learning_rate": 9.32498457295319e-07, "loss": 7.970631122589111e-05, "reward": 0.606249988079071, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 535.0625, "completions/min_length": 444.0, "epoch": 5.0132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.019510140642523766, "kl": 0.006735703092999756, "learning_rate": 9.324340484626359e-07, "loss": 6.732705514878035e-05, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.12909944355487823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 420.1875, "completions/min_length": 321.0, "epoch": 5.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007431880570948124, "kl": 0.00510916905477643, "learning_rate": 9.323696111422921e-07, "loss": 5.1193212129874155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 479.5625, "completions/min_length": 427.0, "epoch": 5.016176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.09377920627594, "kl": 0.009945589932613075, "learning_rate": 9.323051453385325e-07, "loss": 9.882263839244843e-05, "reward": 0.9225091934204102, "reward_std": 0.08465563505887985, "rewards/DrugCombAccuracyCOTORM/mean": 0.9083448648452759, "rewards/DrugCombAccuracyCOTORM/std": 0.16663908958435059, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 514.875, "completions/min_length": 402.0, "epoch": 5.017647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8822717666625977, "kl": 0.007842056453227997, "learning_rate": 9.322406510556039e-07, "loss": 7.833167910575867e-05, "reward": 0.7908541560173035, "reward_std": 0.162140354514122, "rewards/DrugCombAccuracyCOTORM/mean": 0.7444270849227905, "rewards/DrugCombAccuracyCOTORM/std": 0.37752246856689453, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 504.6875, "completions/min_length": 454.0, "epoch": 5.019117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0797992944717407, "kl": 0.00964505854062736, "learning_rate": 9.321761282977551e-07, "loss": 9.403575677424669e-05, "reward": 0.7513868808746338, "reward_std": 0.20110559463500977, "rewards/DrugCombAccuracyCOTORM/mean": 0.716577410697937, "rewards/DrugCombAccuracyCOTORM/std": 0.4218542277812958, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.4069705307483673, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 434.5625, "completions/min_length": 398.0, "epoch": 5.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2038358449935913, "kl": 0.009068755665794015, "learning_rate": 9.321115770692367e-07, "loss": 9.07531357370317e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 460.1875, "completions/min_length": 366.0, "epoch": 5.022058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9146257042884827, "kl": 0.0065976319601759315, "learning_rate": 9.320469973743012e-07, "loss": 6.573716382263228e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 464.125, "completions/min_length": 411.0, "epoch": 5.023529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3317214250564575, "kl": 0.006137285730801523, "learning_rate": 9.319823892172027e-07, "loss": 6.167218089103699e-05, "reward": 0.75, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 449.9375, "completions/min_length": 386.0, "epoch": 5.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.012456364929676056, "kl": 0.006459435680881143, "learning_rate": 9.319177526021977e-07, "loss": 6.484534969786182e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 503.75, "completions/min_length": 462.0, "epoch": 5.026470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9512353539466858, "kl": 0.007257302873767912, "learning_rate": 9.318530875335445e-07, "loss": 7.259845733642578e-05, "reward": 0.8233333826065063, "reward_std": 0.14813122153282166, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3265986442565918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 466.25, "completions/min_length": 402.0, "epoch": 5.027941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 2.329956293106079, "kl": 0.01061669085174799, "learning_rate": 9.317883940155023e-07, "loss": 0.00010595470666885376, "reward": 0.6233125329017639, "reward_std": 0.3191485106945038, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.609375, "rewards/DrugCombCoverageCOTORM/std": 0.4913311004638672, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 447.875, "completions/min_length": 363.0, "epoch": 5.029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.020559702068567276, "kl": 0.007680958544369787, "learning_rate": 9.317236720523337e-07, "loss": 7.669807382626459e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 446.5, "completions/min_length": 415.0, "epoch": 5.030882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.009543354623019695, "kl": 0.006176818744279444, "learning_rate": 9.31658921648302e-07, "loss": 6.174585723783821e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 441.625, "completions/min_length": 353.0, "epoch": 5.0323529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.580457091331482, "kl": 0.008517863345332444, "learning_rate": 9.315941428076732e-07, "loss": 8.397549390792847e-05, "reward": 0.4234375059604645, "reward_std": 0.3836613893508911, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 496.625, "completions/min_length": 383.0, "epoch": 5.033823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011424927040934563, "kl": 0.006264419993385673, "learning_rate": 9.315293355347142e-07, "loss": 6.244204996619374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 443.625, "completions/min_length": 368.0, "epoch": 5.035294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.6857510805130005, "kl": 0.013370100175961852, "learning_rate": 9.314644998336948e-07, "loss": 0.00013154000043869019, "reward": 0.6312500238418579, "reward_std": 0.4467061161994934, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 424.8125, "completions/min_length": 353.0, "epoch": 5.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.8215596675872803, "kl": 0.007562440354377031, "learning_rate": 9.31399635708886e-07, "loss": 7.561708480352536e-05, "reward": 0.882437527179718, "reward_std": 0.16225166618824005, "rewards/DrugCombAccuracyCOTORM/mean": 0.8589062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.30334246158599854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 484.25, "completions/min_length": 426.0, "epoch": 5.038235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 2.6792619228363037, "kl": 0.03894890518859029, "learning_rate": 9.313347431645609e-07, "loss": 0.0003844723105430603, "reward": 0.8187500238418579, "reward_std": 0.2626088559627533, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.3145764470100403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 420.9375, "completions/min_length": 330.0, "epoch": 5.0397058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.8932745456695557, "kl": 0.009537268429994583, "learning_rate": 9.312698222049946e-07, "loss": 9.803473949432373e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 528.6875, "completions/min_length": 461.0, "epoch": 5.041176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3272957801818848, "kl": 0.009481054497882724, "learning_rate": 9.312048728344637e-07, "loss": 9.47117805480957e-05, "reward": 0.612500011920929, "reward_std": 0.3429144620895386, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 499.4375, "completions/min_length": 394.0, "epoch": 5.04264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9041211605072021, "kl": 0.008510289946570992, "learning_rate": 9.311398950572471e-07, "loss": 8.481417899020016e-05, "reward": 0.7166666984558105, "reward_std": 0.16618981957435608, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4629814922809601, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 476.5, "completions/min_length": 440.0, "epoch": 5.044117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.3822132349014282, "kl": 0.007331499946303666, "learning_rate": 9.310748888776253e-07, "loss": 7.414285937556997e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 483.125, "completions/min_length": 415.0, "epoch": 5.045588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9741250872612, "kl": 0.007811760762706399, "learning_rate": 9.310098542998808e-07, "loss": 7.797405123710632e-05, "reward": 0.7014166712760925, "reward_std": 0.12451375275850296, "rewards/DrugCombAccuracyCOTORM/mean": 0.6449999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4190465211868286, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 448.4375, "completions/min_length": 399.0, "epoch": 5.047058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016314279288053513, "kl": 0.006737543269991875, "learning_rate": 9.309447913282977e-07, "loss": 6.769136234652251e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 447.5, "completions/min_length": 383.0, "epoch": 5.048529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0985797718167305, "kl": 0.012608931981958449, "learning_rate": 9.308796999671624e-07, "loss": 0.0001237471296917647, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 405.0, "completions/min_length": 362.0, "epoch": 5.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.021157676354050636, "kl": 0.007972296443767846, "learning_rate": 9.308145802207628e-07, "loss": 8.101446292130277e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 476.125, "completions/min_length": 439.0, "epoch": 5.051470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.193700909614563, "kl": 0.006438048905692995, "learning_rate": 9.307494320933891e-07, "loss": 6.406009197235107e-05, "reward": 0.637499988079071, "reward_std": 0.33736228942871094, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 426.5625, "completions/min_length": 392.0, "epoch": 5.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.021575793623924255, "kl": 0.007749185431748629, "learning_rate": 9.306842555893327e-07, "loss": 7.678303518332541e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 479.0625, "completions/min_length": 399.0, "epoch": 5.054411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.371937870979309, "kl": 0.007004751008935273, "learning_rate": 9.306190507128876e-07, "loss": 7.048994302749634e-05, "reward": 0.7250000238418579, "reward_std": 0.4039267897605896, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 432.4375, "completions/min_length": 366.0, "epoch": 5.055882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013598599471151829, "kl": 0.007467329618521035, "learning_rate": 9.305538174683492e-07, "loss": 7.494485180359334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 452.5625, "completions/min_length": 424.0, "epoch": 5.057352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.042189359664917, "kl": 0.007479323307052255, "learning_rate": 9.304885558600148e-07, "loss": 7.504969835281372e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 541.6875, "completions/min_length": 394.0, "epoch": 5.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8420180678367615, "kl": 0.005871601635590196, "learning_rate": 9.304232658921837e-07, "loss": 6.383789877872914e-05, "reward": 0.7156250476837158, "reward_std": 0.23563647270202637, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.25, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 516.0625, "completions/min_length": 464.0, "epoch": 5.060294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9467038512229919, "kl": 0.006021657143719494, "learning_rate": 9.303579475691574e-07, "loss": 6.057322025299072e-05, "reward": 0.8505833745002747, "reward_std": 0.09126722812652588, "rewards/DrugCombAccuracyCOTORM/mean": 0.815833330154419, "rewards/DrugCombAccuracyCOTORM/std": 0.2410086691379547, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 466.0625, "completions/min_length": 421.0, "epoch": 5.061764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8734140396118164, "kl": 0.006006391835398972, "learning_rate": 9.302926008952384e-07, "loss": 6.0548038163688034e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 415.6875, "completions/min_length": 370.0, "epoch": 5.063235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03871047869324684, "kl": 0.008500716183334589, "learning_rate": 9.302272258747319e-07, "loss": 8.394225733354688e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 511.5, "completions/min_length": 463.0, "epoch": 5.064705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7284300327301025, "kl": 0.006170517299324274, "learning_rate": 9.301618225119444e-07, "loss": 6.243868847377598e-05, "reward": 0.5671666264533997, "reward_std": 0.11347679048776627, "rewards/DrugCombAccuracyCOTORM/mean": 0.49541670083999634, "rewards/DrugCombAccuracyCOTORM/std": 0.3321286737918854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.18757714331150055, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 421.4375, "completions/min_length": 363.0, "epoch": 5.0661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.032734017819166183, "kl": 0.00817008304875344, "learning_rate": 9.300963908111847e-07, "loss": 8.171953959390521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 465.9375, "completions/min_length": 408.0, "epoch": 5.067647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.7526448965072632, "kl": 0.00859252642840147, "learning_rate": 9.300309307767633e-07, "loss": 8.670985698699951e-05, "reward": 0.6499999761581421, "reward_std": 0.39218369126319885, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 457.75, "completions/min_length": 413.0, "epoch": 5.069117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0785961151123047, "kl": 0.008677693665958941, "learning_rate": 9.299654424129924e-07, "loss": 8.63984678289853e-05, "reward": 0.7312500476837158, "reward_std": 0.22350695729255676, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 481.4375, "completions/min_length": 413.0, "epoch": 5.070588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.00855542067438364, "kl": 0.006766156177036464, "learning_rate": 9.298999257241862e-07, "loss": 6.718867371091619e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 443.875, "completions/min_length": 377.0, "epoch": 5.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01890488900244236, "kl": 0.009400335024110973, "learning_rate": 9.298343807146609e-07, "loss": 9.385126759298146e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 458.0, "completions/min_length": 409.0, "epoch": 5.073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.026183247566223145, "kl": 0.008569554192945361, "learning_rate": 9.297688073887342e-07, "loss": 8.62789893290028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 472.0625, "completions/min_length": 315.0, "epoch": 5.075, "frac_reward_zero_std": 0.5, "grad_norm": 1.2564702033996582, "kl": 0.012954265926964581, "learning_rate": 9.297032057507264e-07, "loss": 0.0001293867826461792, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 448.375, "completions/min_length": 381.0, "epoch": 5.076470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3336995840072632, "kl": 0.008382623200304806, "learning_rate": 9.296375758049586e-07, "loss": 8.443742990493774e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 432.375, "completions/min_length": 386.0, "epoch": 5.077941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009419173933565617, "kl": 0.006559896748512983, "learning_rate": 9.295719175557546e-07, "loss": 6.516738358186558e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 450.25, "completions/min_length": 398.0, "epoch": 5.079411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015000343322754, "kl": 0.008161334902979434, "learning_rate": 9.295062310074398e-07, "loss": 8.04513692855835e-05, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 447.5625, "completions/min_length": 396.0, "epoch": 5.080882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.019858554005622864, "kl": 0.00915355805773288, "learning_rate": 9.294405161643415e-07, "loss": 9.174442675430328e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 449.25, "completions/min_length": 376.0, "epoch": 5.08235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8310496807098389, "kl": 0.007346316589973867, "learning_rate": 9.293747730307888e-07, "loss": 7.349997758865356e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 416.0, "completions/min_length": 330.0, "epoch": 5.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010251441970467567, "kl": 0.006230054190382361, "learning_rate": 9.293090016111125e-07, "loss": 6.222094816621393e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 475.1875, "completions/min_length": 430.0, "epoch": 5.0852941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.014964217320084572, "kl": 0.008457369403913617, "learning_rate": 9.292432019096459e-07, "loss": 8.522458665538579e-05, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 482.875, "completions/min_length": 365.0, "epoch": 5.086764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8013362288475037, "kl": 0.008455196861177683, "learning_rate": 9.291773739307232e-07, "loss": 8.550286293029785e-05, "reward": 0.6279761791229248, "reward_std": 0.1305699497461319, "rewards/DrugCombAccuracyCOTORM/mean": 0.5714285969734192, "rewards/DrugCombAccuracyCOTORM/std": 0.47809144854545593, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6871843338012695, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.25, "completions/min_length": 388.0, "epoch": 5.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8165579438209534, "kl": 0.006505462690256536, "learning_rate": 9.291115176786812e-07, "loss": 6.451059016399086e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 443.5, "completions/min_length": 406.0, "epoch": 5.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01416513416916132, "kl": 0.0055042566964402795, "learning_rate": 9.290456331578586e-07, "loss": 5.478772072819993e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 456.125, "completions/min_length": 386.0, "epoch": 5.091176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.013227677904069424, "kl": 0.00786229851655662, "learning_rate": 9.289797203725953e-07, "loss": 7.807696238160133e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 452.875, "completions/min_length": 407.0, "epoch": 5.0926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02293437346816063, "kl": 0.00577226304449141, "learning_rate": 9.289137793272337e-07, "loss": 5.780609717476182e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 453.25, "completions/min_length": 393.0, "epoch": 5.094117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.662443995475769, "kl": 0.008473853464238346, "learning_rate": 9.288478100261176e-07, "loss": 8.52048397064209e-05, "reward": 0.5625, "reward_std": 0.425564706325531, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 465.5625, "completions/min_length": 431.0, "epoch": 5.095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9282951354980469, "kl": 0.0084908502176404, "learning_rate": 9.287818124735933e-07, "loss": 8.516013622283936e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 462.4375, "completions/min_length": 422.0, "epoch": 5.097058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9489682912826538, "kl": 0.008661441970616579, "learning_rate": 9.28715786674008e-07, "loss": 8.652475662529469e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 423.0, "completions/min_length": 391.0, "epoch": 5.098529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 1.4917908906936646, "kl": 0.023015900049358606, "learning_rate": 9.28649732631712e-07, "loss": 0.0002417943615000695, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 484.5, "completions/min_length": 435.0, "epoch": 5.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.027050813660025597, "kl": 0.0072357881581410766, "learning_rate": 9.285836503510562e-07, "loss": 7.242272840812802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 461.1875, "completions/min_length": 408.0, "epoch": 5.101470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9801279902458191, "kl": 0.008361143409274518, "learning_rate": 9.28517539836394e-07, "loss": 8.31338475109078e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 438.625, "completions/min_length": 405.0, "epoch": 5.102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01341160573065281, "kl": 0.007095739245414734, "learning_rate": 9.284514010920808e-07, "loss": 7.085176184773445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 432.875, "completions/min_length": 379.0, "epoch": 5.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9397567510604858, "kl": 0.0071842961478978395, "learning_rate": 9.283852341224736e-07, "loss": 7.240474224090576e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 489.625, "completions/min_length": 432.0, "epoch": 5.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.7355563640594482, "kl": 0.009165601804852486, "learning_rate": 9.283190389319313e-07, "loss": 9.041682642418891e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 450.875, "completions/min_length": 376.0, "epoch": 5.107352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012595572508871555, "kl": 0.006361632840707898, "learning_rate": 9.282528155248147e-07, "loss": 6.429356290027499e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 457.5625, "completions/min_length": 413.0, "epoch": 5.108823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02691611647605896, "kl": 0.009145551710389555, "learning_rate": 9.281865639054863e-07, "loss": 9.094025153899565e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 429.375, "completions/min_length": 386.0, "epoch": 5.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1400010585784912, "kl": 0.006521061062812805, "learning_rate": 9.281202840783107e-07, "loss": 6.496906280517578e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 533.8125, "completions/min_length": 419.0, "epoch": 5.1117647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.8451194763183594, "kl": 0.008449174696579576, "learning_rate": 9.280539760476543e-07, "loss": 8.495151996612549e-05, "reward": 0.8780624866485596, "reward_std": 0.07934647798538208, "rewards/DrugCombAccuracyCOTORM/mean": 0.8514844179153442, "rewards/DrugCombAccuracyCOTORM/std": 0.20363155007362366, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.055901702493429184, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 506.25, "completions/min_length": 380.0, "epoch": 5.113235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8069145083427429, "kl": 0.0068354259710758924, "learning_rate": 9.279876398178851e-07, "loss": 6.897188723087311e-05, "reward": 0.5642361044883728, "reward_std": 0.04347751662135124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6423611044883728, "rewards/DrugCombCoverageCOTORM/std": 0.6994909644126892, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 413.4375, "completions/min_length": 332.0, "epoch": 5.114705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01043836772441864, "kl": 0.0060301137855276465, "learning_rate": 9.279212753933734e-07, "loss": 6.004196620779112e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 469.1875, "completions/min_length": 380.0, "epoch": 5.116176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9602224230766296, "kl": 0.007225858513265848, "learning_rate": 9.278548827784909e-07, "loss": 7.295183604583144e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 456.8125, "completions/min_length": 388.0, "epoch": 5.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2569130659103394, "kl": 0.010658257640898228, "learning_rate": 9.277884619776115e-07, "loss": 0.00010623782873153687, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 436.875, "completions/min_length": 386.0, "epoch": 5.1191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012096280232071877, "kl": 0.006619660300202668, "learning_rate": 9.277220129951109e-07, "loss": 6.641951040364802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 426.5, "completions/min_length": 356.0, "epoch": 5.120588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011707273311913013, "kl": 0.007031968212686479, "learning_rate": 9.276555358353665e-07, "loss": 6.916867278050631e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 447.0625, "completions/min_length": 393.0, "epoch": 5.122058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9695274233818054, "kl": 0.009038170799612999, "learning_rate": 9.275890305027576e-07, "loss": 8.974969387054443e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 466.4375, "completions/min_length": 425.0, "epoch": 5.123529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.014537742361426353, "kl": 0.008400568389333785, "learning_rate": 9.275224970016655e-07, "loss": 8.437735959887505e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 470.6875, "completions/min_length": 418.0, "epoch": 5.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.016349107027053833, "kl": 0.007207214832305908, "learning_rate": 9.274559353364733e-07, "loss": 7.252577051986009e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 480.3125, "completions/min_length": 439.0, "epoch": 5.126470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0941474437713623, "kl": 0.007884655962698162, "learning_rate": 9.273893455115658e-07, "loss": 7.95610249042511e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 520.0625, "completions/min_length": 430.0, "epoch": 5.127941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0613088607788086, "kl": 0.006907050614245236, "learning_rate": 9.273227275313296e-07, "loss": 6.897002458572388e-05, "reward": 0.48750001192092896, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 463.8125, "completions/min_length": 395.0, "epoch": 5.129411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.6543943881988525, "kl": 0.010306930751539767, "learning_rate": 9.272560814001537e-07, "loss": 0.00010289251804351807, "reward": 0.675000011920929, "reward_std": 0.4505062699317932, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 429.5625, "completions/min_length": 364.0, "epoch": 5.1308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.020856188610196114, "kl": 0.009272699942812324, "learning_rate": 9.271894071224284e-07, "loss": 9.305252751801163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 435.5, "completions/min_length": 399.0, "epoch": 5.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011326923966407776, "kl": 0.005580082768574357, "learning_rate": 9.27122704702546e-07, "loss": 5.551910726353526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 429.25, "completions/min_length": 351.0, "epoch": 5.133823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.10383764654397964, "kl": 0.01735291350632906, "learning_rate": 9.270559741449009e-07, "loss": 0.00017306044173892587, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 417.1875, "completions/min_length": 341.0, "epoch": 5.135294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.04388601705431938, "kl": 0.00888902007136494, "learning_rate": 9.269892154538889e-07, "loss": 8.954670920502394e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 489.1875, "completions/min_length": 349.0, "epoch": 5.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.884818434715271, "kl": 0.010164296138100326, "learning_rate": 9.269224286339079e-07, "loss": 0.00010181445395573974, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 493.0625, "completions/min_length": 418.0, "epoch": 5.1382352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.2057278156280518, "kl": 0.0058969666715711355, "learning_rate": 9.268556136893578e-07, "loss": 5.848705768585205e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 419.9375, "completions/min_length": 378.0, "epoch": 5.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.021290555596351624, "kl": 0.007998384302482009, "learning_rate": 9.267887706246399e-07, "loss": 7.957771595101804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 458.0, "completions/min_length": 367.0, "epoch": 5.141176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02000846527516842, "kl": 0.008753989124670625, "learning_rate": 9.267218994441579e-07, "loss": 8.549664926249534e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 461.0625, "completions/min_length": 408.0, "epoch": 5.142647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.05236769840121269, "kl": 0.007897699251770973, "learning_rate": 9.266550001523173e-07, "loss": 7.9502904554829e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 511.25, "completions/min_length": 437.0, "epoch": 5.144117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.531755805015564, "kl": 0.010273766238242388, "learning_rate": 9.265880727535247e-07, "loss": 0.0001032315194606781, "reward": 0.7454166412353516, "reward_std": 0.3920228183269501, "rewards/DrugCombAccuracyCOTORM/mean": 0.7104166746139526, "rewards/DrugCombAccuracyCOTORM/std": 0.4468289613723755, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.39849257469177246, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 506.25, "completions/min_length": 414.0, "epoch": 5.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.855272650718689, "kl": 0.0074356027180328965, "learning_rate": 9.265211172521897e-07, "loss": 7.400661706924438e-05, "reward": 0.8112033605575562, "reward_std": 0.11412159353494644, "rewards/DrugCombAccuracyCOTORM/mean": 0.7760484218597412, "rewards/DrugCombAccuracyCOTORM/std": 0.2992344796657562, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9036458730697632, "rewards/DrugCombCoverageCOTORM/std": 0.19526736438274384, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 448.625, "completions/min_length": 385.0, "epoch": 5.147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.039381980895996094, "kl": 0.008806330501101911, "learning_rate": 9.264541336527227e-07, "loss": 8.747393439989537e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 564.75, "completions/min_length": 493.0, "epoch": 5.148529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 2.090750217437744, "kl": 0.01834620290901512, "learning_rate": 9.263871219595367e-07, "loss": 0.0001910179853439331, "reward": 0.31985580921173096, "reward_std": 0.20748107135295868, "rewards/DrugCombAccuracyCOTORM/mean": 0.19200725853443146, "rewards/DrugCombAccuracyCOTORM/std": 0.3067730665206909, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6625000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.5097385048866272, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 482.6875, "completions/min_length": 444.0, "epoch": 5.15, "frac_reward_zero_std": 0.5, "grad_norm": 1.215780258178711, "kl": 0.007391880848444998, "learning_rate": 9.26320082177046e-07, "loss": 7.317960262298584e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 469.1875, "completions/min_length": 406.0, "epoch": 5.151470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.7084648609161377, "kl": 0.0151291498914361, "learning_rate": 9.262530143096674e-07, "loss": 0.00015110895037651062, "reward": 0.5249999761581421, "reward_std": 0.4227043390274048, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 379.6875, "completions/min_length": 331.0, "epoch": 5.152941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02773728407919407, "kl": 0.008762123878113925, "learning_rate": 9.261859183618188e-07, "loss": 8.917787636164576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 449.6875, "completions/min_length": 376.0, "epoch": 5.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008072234690189362, "kl": 0.005003268364816904, "learning_rate": 9.261187943379205e-07, "loss": 5.027988299843855e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 433.4375, "completions/min_length": 359.0, "epoch": 5.155882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.009438305161893368, "kl": 0.006487513775937259, "learning_rate": 9.260516422423943e-07, "loss": 6.449695501942188e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 485.375, "completions/min_length": 406.0, "epoch": 5.1573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8453525900840759, "kl": 0.006636314908973873, "learning_rate": 9.259844620796642e-07, "loss": 6.652685260633007e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 460.6875, "completions/min_length": 403.0, "epoch": 5.158823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011551274918019772, "kl": 0.006191014312207699, "learning_rate": 9.259172538541558e-07, "loss": 6.168103573145345e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 443.25, "completions/min_length": 347.0, "epoch": 5.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.017184533178806305, "kl": 0.0071362428134307265, "learning_rate": 9.258500175702964e-07, "loss": 7.061245560180396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 408.625, "completions/min_length": 333.0, "epoch": 5.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9994687438011169, "kl": 0.007740708882920444, "learning_rate": 9.257827532325157e-07, "loss": 7.692284270888194e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 482.375, "completions/min_length": 393.0, "epoch": 5.163235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9492579102516174, "kl": 0.011177288368344307, "learning_rate": 9.257154608452447e-07, "loss": 0.00011134147644042969, "reward": 0.8968750238418579, "reward_std": 0.19095037877559662, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 444.8125, "completions/min_length": 404.0, "epoch": 5.1647058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.010422857478260994, "kl": 0.006979067111387849, "learning_rate": 9.256481404129164e-07, "loss": 7.016856397967786e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 449.5625, "completions/min_length": 388.0, "epoch": 5.166176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008110450580716133, "kl": 0.005542799946852028, "learning_rate": 9.255807919399659e-07, "loss": 5.559313649428077e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 406.25, "completions/min_length": 356.0, "epoch": 5.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02074434421956539, "kl": 0.008873544866219163, "learning_rate": 9.255134154308298e-07, "loss": 8.867103315424174e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 422.875, "completions/min_length": 357.0, "epoch": 5.169117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.012526940554380417, "kl": 0.007854739669710398, "learning_rate": 9.254460108899468e-07, "loss": 7.804415508871898e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 485.8125, "completions/min_length": 439.0, "epoch": 5.170588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.6379661560058594, "kl": 0.004399993398692459, "learning_rate": 9.253785783217572e-07, "loss": 4.3645501136779785e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 467.0625, "completions/min_length": 348.0, "epoch": 5.172058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 2.1557369232177734, "kl": 0.011684251483529806, "learning_rate": 9.253111177307034e-07, "loss": 0.00011970847845077515, "reward": 0.8669524192810059, "reward_std": 0.26315462589263916, "rewards/DrugCombAccuracyCOTORM/mean": 0.8441071510314941, "rewards/DrugCombAccuracyCOTORM/std": 0.29682761430740356, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 469.5625, "completions/min_length": 390.0, "epoch": 5.173529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9557008147239685, "kl": 0.008053087047301233, "learning_rate": 9.252436291212294e-07, "loss": 8.079565304797143e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 476.0, "completions/min_length": 376.0, "epoch": 5.175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9718994498252869, "kl": 0.0078073525801301, "learning_rate": 9.251761124977814e-07, "loss": 7.808208465576172e-05, "reward": 0.6917470693588257, "reward_std": 0.15858297049999237, "rewards/DrugCombAccuracyCOTORM/mean": 0.6218452453613281, "rewards/DrugCombAccuracyCOTORM/std": 0.4756428301334381, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9427083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.12441994994878769, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 479.75, "completions/min_length": 380.0, "epoch": 5.176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9408583045005798, "kl": 0.006899807485751808, "learning_rate": 9.251085678648071e-07, "loss": 6.950695387786254e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 416.8125, "completions/min_length": 336.0, "epoch": 5.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.297814965248108, "kl": 0.006345920963212848, "learning_rate": 9.25040995226756e-07, "loss": 6.34118914604187e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 462.8125, "completions/min_length": 378.0, "epoch": 5.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0432870388031006, "kl": 0.010147486347705126, "learning_rate": 9.249733945880799e-07, "loss": 0.00010087764530908316, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 487.25, "completions/min_length": 389.0, "epoch": 5.180882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9570901393890381, "kl": 0.007555956486612558, "learning_rate": 9.24905765953232e-07, "loss": 7.520665531046689e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 440.9375, "completions/min_length": 370.0, "epoch": 5.182352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.009385469369590282, "kl": 0.005983084090985358, "learning_rate": 9.248381093266677e-07, "loss": 5.958662222838029e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 432.25, "completions/min_length": 362.0, "epoch": 5.1838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0334423780441284, "kl": 0.0068790502846241, "learning_rate": 9.247704247128436e-07, "loss": 6.838887929916382e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 473.375, "completions/min_length": 437.0, "epoch": 5.185294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.5533254146575928, "kl": 0.008275232161395252, "learning_rate": 9.247027121162191e-07, "loss": 8.165836334228516e-05, "reward": 0.4375, "reward_std": 0.3942252993583679, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 442.375, "completions/min_length": 377.0, "epoch": 5.186764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9840527176856995, "kl": 0.007589215994812548, "learning_rate": 9.246349715412546e-07, "loss": 7.570811430923641e-05, "reward": 0.8698333501815796, "reward_std": 0.028292685747146606, "rewards/DrugCombAccuracyCOTORM/mean": 0.8477083444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.16454075276851654, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 398.1875, "completions/min_length": 326.0, "epoch": 5.188235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1636534929275513, "kl": 0.005495799821801484, "learning_rate": 9.245672029924127e-07, "loss": 5.435696948552504e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 430.5625, "completions/min_length": 378.0, "epoch": 5.189705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0453166961669922, "kl": 0.006722350139170885, "learning_rate": 9.244994064741582e-07, "loss": 6.802051211707294e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 544.9375, "completions/min_length": 442.0, "epoch": 5.1911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.96175616979599, "kl": 0.007573185604996979, "learning_rate": 9.24431581990957e-07, "loss": 7.705122698098421e-05, "reward": 0.9587500095367432, "reward_std": 0.08466436713933945, "rewards/DrugCombAccuracyCOTORM/mean": 0.956250011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.13149777054786682, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 495.125, "completions/min_length": 396.0, "epoch": 5.192647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.190086841583252, "kl": 0.009507446317002177, "learning_rate": 9.24363729547277e-07, "loss": 9.543448686599731e-05, "reward": 0.8568333387374878, "reward_std": 0.1991622895002365, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 462.375, "completions/min_length": 404.0, "epoch": 5.194117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.2713031768798828, "kl": 0.007356135058216751, "learning_rate": 9.242958491475888e-07, "loss": 7.377649308182299e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 445.125, "completions/min_length": 348.0, "epoch": 5.195588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.023095685988664627, "kl": 0.009851020062342286, "learning_rate": 9.242279407963635e-07, "loss": 9.800129191717133e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 491.8125, "completions/min_length": 437.0, "epoch": 5.197058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013227070681750774, "kl": 0.005817478173412383, "learning_rate": 9.241600044980753e-07, "loss": 5.820149090141058e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 475.0625, "completions/min_length": 395.0, "epoch": 5.198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 4.424529075622559, "kl": 0.10887700761668384, "learning_rate": 9.240920402571993e-07, "loss": 0.0010094937169924378, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 443.4375, "completions/min_length": 372.0, "epoch": 5.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.008370471186935902, "kl": 0.005189362796954811, "learning_rate": 9.240240480782129e-07, "loss": 5.192066601011902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 454.0625, "completions/min_length": 385.0, "epoch": 5.201470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9652451872825623, "kl": 0.006136608077213168, "learning_rate": 9.239560279655953e-07, "loss": 6.149318505777046e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 460.375, "completions/min_length": 430.0, "epoch": 5.202941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0679153203964233, "kl": 0.00851853471249342, "learning_rate": 9.238879799238276e-07, "loss": 8.538365364074707e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 465.625, "completions/min_length": 415.0, "epoch": 5.204411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009415553882718086, "kl": 0.006245515076443553, "learning_rate": 9.238199039573924e-07, "loss": 6.272653990890831e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 428.25, "completions/min_length": 370.0, "epoch": 5.205882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.012837851420044899, "kl": 0.006387762608937919, "learning_rate": 9.237518000707745e-07, "loss": 6.321450200630352e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 455.4375, "completions/min_length": 392.0, "epoch": 5.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04673297703266144, "kl": 0.009613678441382945, "learning_rate": 9.236836682684603e-07, "loss": 9.531201794743538e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 473.375, "completions/min_length": 385.0, "epoch": 5.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.020108338445425034, "kl": 0.007503350847400725, "learning_rate": 9.236155085549384e-07, "loss": 7.424838258884847e-05, "reward": 0.27133333683013916, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.10999999940395355, "rewards/DrugCombAccuracyCOTORM/std": 0.1136075109243393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 415.625, "completions/min_length": 382.0, "epoch": 5.2102941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.013368668034672737, "kl": 0.006615847465582192, "learning_rate": 9.235473209346988e-07, "loss": 6.631274300161749e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 432.0625, "completions/min_length": 367.0, "epoch": 5.211764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019085848703980446, "kl": 0.008343552355654538, "learning_rate": 9.234791054122335e-07, "loss": 8.264229109045118e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 468.5, "completions/min_length": 396.0, "epoch": 5.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.025652628391981125, "kl": 0.007477266248315573, "learning_rate": 9.234108619920364e-07, "loss": 7.476293103536591e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 434.125, "completions/min_length": 401.0, "epoch": 5.214705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.10322873294353485, "kl": 0.007160012610256672, "learning_rate": 9.233425906786034e-07, "loss": 7.032300345599651e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 450.9375, "completions/min_length": 389.0, "epoch": 5.216176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.021865632385015488, "kl": 0.00859582715202123, "learning_rate": 9.232742914764317e-07, "loss": 8.620205335319042e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 475.875, "completions/min_length": 439.0, "epoch": 5.2176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8989261984825134, "kl": 0.005844002240337431, "learning_rate": 9.232059643900209e-07, "loss": 5.853707989444956e-05, "reward": 0.8708333373069763, "reward_std": 0.08177106082439423, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.20069323480129242, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 485.0, "completions/min_length": 342.0, "epoch": 5.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9559522271156311, "kl": 0.007606637547723949, "learning_rate": 9.231376094238722e-07, "loss": 7.681510032853112e-05, "reward": 0.6539027690887451, "reward_std": 0.02053174190223217, "rewards/DrugCombAccuracyCOTORM/mean": 0.5944122076034546, "rewards/DrugCombAccuracyCOTORM/std": 0.41963857412338257, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.783730149269104, "rewards/DrugCombCoverageCOTORM/std": 0.23735666275024414, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 478.5, "completions/min_length": 425.0, "epoch": 5.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1090205907821655, "kl": 0.00808784959372133, "learning_rate": 9.230692265824886e-07, "loss": 8.052587509155273e-05, "reward": 0.6114761829376221, "reward_std": 0.2084292322397232, "rewards/DrugCombAccuracyCOTORM/mean": 0.6080952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.49096694588661194, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 402.0625, "completions/min_length": 320.0, "epoch": 5.222058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9477744102478027, "kl": 0.006582698319107294, "learning_rate": 9.23000815870375e-07, "loss": 6.571277481270954e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 449.5625, "completions/min_length": 394.0, "epoch": 5.223529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.017779426649212837, "kl": 0.009000514401122928, "learning_rate": 9.229323772920381e-07, "loss": 9.027175110531971e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 508.9375, "completions/min_length": 378.0, "epoch": 5.225, "frac_reward_zero_std": 0.5, "grad_norm": 0.7960909008979797, "kl": 0.008586674113757908, "learning_rate": 9.228639108519866e-07, "loss": 8.745118975639343e-05, "reward": 0.8162564039230347, "reward_std": 0.14556849002838135, "rewards/DrugCombAccuracyCOTORM/mean": 0.8036538362503052, "rewards/DrugCombAccuracyCOTORM/std": 0.28122881054878235, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7333333492279053, "rewards/DrugCombCoverageCOTORM/std": 0.5625010132789612, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 458.5625, "completions/min_length": 379.0, "epoch": 5.226470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0338724851608276, "kl": 0.009179401909932494, "learning_rate": 9.227954165547307e-07, "loss": 9.11698880372569e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/mean_length": 569.25, "completions/min_length": 426.0, "epoch": 5.227941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3500117063522339, "kl": 0.009436734369955957, "learning_rate": 9.227268944047828e-07, "loss": 9.541958570480347e-05, "reward": 0.5531666874885559, "reward_std": 0.42725473642349243, "rewards/DrugCombAccuracyCOTORM/mean": 0.4883333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.45921674370765686, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 443.75, "completions/min_length": 400.0, "epoch": 5.229411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8724340200424194, "kl": 0.007503806613385677, "learning_rate": 9.226583444066568e-07, "loss": 7.395767897833139e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 451.0625, "completions/min_length": 387.0, "epoch": 5.230882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.020977672189474106, "kl": 0.008413235191255808, "learning_rate": 9.225897665648688e-07, "loss": 8.350827556569129e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 477.1875, "completions/min_length": 444.0, "epoch": 5.232352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3943754434585571, "kl": 0.007242107298225164, "learning_rate": 9.225211608839363e-07, "loss": 7.257765537360683e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 504.0625, "completions/min_length": 424.0, "epoch": 5.233823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0570284128189087, "kl": 0.006618438055738807, "learning_rate": 9.22452527368379e-07, "loss": 6.595126615138724e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 534.75, "completions/min_length": 479.0, "epoch": 5.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9617329239845276, "kl": 0.00777887599542737, "learning_rate": 9.223838660227182e-07, "loss": 7.7893964771647e-05, "reward": 0.6758666634559631, "reward_std": 0.1347963660955429, "rewards/DrugCombAccuracyCOTORM/mean": 0.6156666874885559, "rewards/DrugCombAccuracyCOTORM/std": 0.45358914136886597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.20184335112571716, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 474.9375, "completions/min_length": 405.0, "epoch": 5.2367647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.00824651401489973, "kl": 0.004961079568602145, "learning_rate": 9.223151768514774e-07, "loss": 4.9508162192068994e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 466.0, "completions/min_length": 430.0, "epoch": 5.238235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.7707782983779907, "kl": 0.008844809606671333, "learning_rate": 9.222464598591815e-07, "loss": 8.871406316757202e-05, "reward": 0.6499999761581421, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 436.25, "completions/min_length": 370.0, "epoch": 5.239705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7009968757629395, "kl": 0.006189547013491392, "learning_rate": 9.221777150503573e-07, "loss": 6.152689456939697e-05, "reward": 0.6327500343322754, "reward_std": 0.023334523662924767, "rewards/DrugCombAccuracyCOTORM/mean": 0.5721874833106995, "rewards/DrugCombAccuracyCOTORM/std": 0.44363635778427124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 434.25, "completions/min_length": 373.0, "epoch": 5.241176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012546458281576633, "kl": 0.008067195070907474, "learning_rate": 9.221089424295337e-07, "loss": 8.065737347351387e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 459.4375, "completions/min_length": 379.0, "epoch": 5.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.124869704246521, "kl": 0.012925805291160941, "learning_rate": 9.220401420012411e-07, "loss": 0.00012897938722744584, "reward": 0.7229167222976685, "reward_std": 0.2202879637479782, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3191423714160919, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 554.4375, "completions/min_length": 455.0, "epoch": 5.2441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4406309127807617, "kl": 0.009870017180219293, "learning_rate": 9.219713137700121e-07, "loss": 9.928643703460693e-05, "reward": 0.3219708502292633, "reward_std": 0.25122886896133423, "rewards/DrugCombAccuracyCOTORM/mean": 0.2252500057220459, "rewards/DrugCombAccuracyCOTORM/std": 0.32319116592407227, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4333333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.43614476919174194, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 476.6875, "completions/min_length": 398.0, "epoch": 5.245588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012553847394883633, "kl": 0.006956797791644931, "learning_rate": 9.219024577403806e-07, "loss": 6.952886906219646e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/mean_length": 523.8125, "completions/min_length": 430.0, "epoch": 5.247058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.094973087310791, "kl": 0.007543007261119783, "learning_rate": 9.218335739168832e-07, "loss": 7.600712706334889e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 457.4375, "completions/min_length": 407.0, "epoch": 5.248529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0137366047129035, "kl": 0.0065564283868297935, "learning_rate": 9.217646623040572e-07, "loss": 6.58140706946142e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 430.5625, "completions/min_length": 368.0, "epoch": 5.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.010729774832725525, "kl": 0.007100960821844637, "learning_rate": 9.216957229064428e-07, "loss": 7.082987576723099e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 468.25, "completions/min_length": 397.0, "epoch": 5.251470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.014419875107705593, "kl": 0.007390233455225825, "learning_rate": 9.216267557285813e-07, "loss": 7.365665078395978e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 451.4375, "completions/min_length": 360.0, "epoch": 5.252941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8281769156455994, "kl": 0.007694389903917909, "learning_rate": 9.215577607750162e-07, "loss": 7.684531738050282e-05, "reward": 0.8553333282470703, "reward_std": 0.20654159784317017, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471023142337799, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 448.5625, "completions/min_length": 380.0, "epoch": 5.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015439724549651146, "kl": 0.006873083766549826, "learning_rate": 9.214887380502925e-07, "loss": 6.890705844853073e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 438.6875, "completions/min_length": 408.0, "epoch": 5.2558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7515788674354553, "kl": 0.0072757110465317965, "learning_rate": 9.214196875589575e-07, "loss": 7.238239049911499e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 428.1875, "completions/min_length": 390.0, "epoch": 5.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.016547873616218567, "kl": 0.006980657926760614, "learning_rate": 9.2135060930556e-07, "loss": 6.995409785304219e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 436.125, "completions/min_length": 392.0, "epoch": 5.258823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.04741842299699783, "kl": 0.008687648340128362, "learning_rate": 9.212815032946506e-07, "loss": 8.688133675605059e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 441.0, "completions/min_length": 354.0, "epoch": 5.260294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3371847867965698, "kl": 0.006848199293017387, "learning_rate": 9.212123695307818e-07, "loss": 6.838887929916382e-05, "reward": 0.4000000059604645, "reward_std": 0.34844106435775757, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 430.6875, "completions/min_length": 371.0, "epoch": 5.261764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0722336769104004, "kl": 0.008815214154310524, "learning_rate": 9.211432080185082e-07, "loss": 8.785352110862732e-05, "reward": 0.38200002908706665, "reward_std": 0.18175861239433289, "rewards/DrugCombAccuracyCOTORM/mean": 0.28999999165534973, "rewards/DrugCombAccuracyCOTORM/std": 0.4313493072986603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 431.875, "completions/min_length": 402.0, "epoch": 5.2632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 2.00307559967041, "kl": 0.008038884145207703, "learning_rate": 9.210740187623858e-07, "loss": 8.064764551818371e-05, "reward": 0.9177083373069763, "reward_std": 0.17764092981815338, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 457.1875, "completions/min_length": 395.0, "epoch": 5.264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8285608887672424, "kl": 0.0059898883337154984, "learning_rate": 9.210048017669726e-07, "loss": 5.9936282923445106e-05, "reward": 0.625, "reward_std": 0.15352989733219147, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 418.625, "completions/min_length": 368.0, "epoch": 5.266176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009764195419847965, "kl": 0.0068177381763234735, "learning_rate": 9.209355570368284e-07, "loss": 6.763300189049914e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 439.625, "completions/min_length": 388.0, "epoch": 5.267647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.015315551310777664, "kl": 0.009090523235499859, "learning_rate": 9.208662845765151e-07, "loss": 9.072467219084501e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 460.625, "completions/min_length": 406.0, "epoch": 5.269117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0512210130691528, "kl": 0.008988489164039493, "learning_rate": 9.207969843905961e-07, "loss": 8.966028690338135e-05, "reward": 0.8926249742507935, "reward_std": 0.20235960185527802, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 455.8125, "completions/min_length": 411.0, "epoch": 5.270588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9073923230171204, "kl": 0.007933090557344258, "learning_rate": 9.207276564836366e-07, "loss": 7.955729961395264e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 451.625, "completions/min_length": 357.0, "epoch": 5.272058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02154809795320034, "kl": 0.008370737661607563, "learning_rate": 9.206583008602038e-07, "loss": 8.324384543811902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 440.9375, "completions/min_length": 387.0, "epoch": 5.273529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.7543036937713623, "kl": 0.012555449036881328, "learning_rate": 9.205889175248668e-07, "loss": 0.00012599676847457886, "reward": 0.7859375476837158, "reward_std": 0.3689606785774231, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 461.625, "completions/min_length": 397.0, "epoch": 5.275, "frac_reward_zero_std": 0.5, "grad_norm": 1.0181491374969482, "kl": 0.007735521183349192, "learning_rate": 9.205195064821962e-07, "loss": 7.786229252815247e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 504.0, "completions/min_length": 459.0, "epoch": 5.276470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 3.7312917709350586, "kl": 0.01330893556587398, "learning_rate": 9.204500677367647e-07, "loss": 0.0001341700553894043, "reward": 0.6378333568572998, "reward_std": 0.18054792284965515, "rewards/DrugCombAccuracyCOTORM/mean": 0.5681250095367432, "rewards/DrugCombAccuracyCOTORM/std": 0.3993614614009857, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 436.0, "completions/min_length": 338.0, "epoch": 5.277941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.0122028589248657, "kl": 0.007056727888993919, "learning_rate": 9.203806012931467e-07, "loss": 6.979660975048319e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 494.75, "completions/min_length": 396.0, "epoch": 5.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012309538200497627, "kl": 0.00689740153029561, "learning_rate": 9.203111071559187e-07, "loss": 6.881354056531563e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 481.3125, "completions/min_length": 443.0, "epoch": 5.280882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.013314559124410152, "kl": 0.007249063113704324, "learning_rate": 9.202415853296585e-07, "loss": 7.269830530276522e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 508.1875, "completions/min_length": 401.0, "epoch": 5.2823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9898010492324829, "kl": 0.0067667977418750525, "learning_rate": 9.201720358189463e-07, "loss": 6.755068898200989e-05, "reward": 0.9125000238418579, "reward_std": 0.18077215552330017, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 484.875, "completions/min_length": 417.0, "epoch": 5.283823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2446980476379395, "kl": 0.0082607347285375, "learning_rate": 9.201024586283637e-07, "loss": 8.188188076019287e-05, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 429.25, "completions/min_length": 372.0, "epoch": 5.285294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9857633113861084, "kl": 0.01093766640406102, "learning_rate": 9.200328537624942e-07, "loss": 0.00010982780804624781, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 503.625, "completions/min_length": 456.0, "epoch": 5.286764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8777015209197998, "kl": 0.029941622051410377, "learning_rate": 9.199632212259231e-07, "loss": 0.0003024786710739136, "reward": 0.8687499761581421, "reward_std": 0.25283604860305786, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 481.0, "completions/min_length": 417.0, "epoch": 5.288235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.018518090248108, "kl": 0.008669276488944888, "learning_rate": 9.198935610232382e-07, "loss": 8.658985461806878e-05, "reward": 0.30000001192092896, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 479.3125, "completions/min_length": 361.0, "epoch": 5.2897058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.0641484260559082, "kl": 0.006388380308635533, "learning_rate": 9.198238731590277e-07, "loss": 6.386158929672092e-05, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 460.1875, "completions/min_length": 363.0, "epoch": 5.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1988791227340698, "kl": 0.0068945749662816525, "learning_rate": 9.197541576378832e-07, "loss": 6.91823661327362e-05, "reward": 0.625, "reward_std": 0.15811388194561005, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 437.8125, "completions/min_length": 384.0, "epoch": 5.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00812588632106781, "kl": 0.005402620765380561, "learning_rate": 9.196844144643967e-07, "loss": 5.419849185273051e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 443.75, "completions/min_length": 391.0, "epoch": 5.294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0327869653701782, "kl": 0.009510914795100689, "learning_rate": 9.196146436431634e-07, "loss": 9.485302871325985e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 446.5, "completions/min_length": 383.0, "epoch": 5.295588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9571573734283447, "kl": 0.01070697489194572, "learning_rate": 9.195448451787791e-07, "loss": 0.00010684877634048462, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 506.25, "completions/min_length": 391.0, "epoch": 5.297058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.2406394481658936, "kl": 0.00791652116458863, "learning_rate": 9.194750190758421e-07, "loss": 7.988512516021729e-05, "reward": 0.780152440071106, "reward_std": 0.3692129850387573, "rewards/DrugCombAccuracyCOTORM/mean": 0.7349561452865601, "rewards/DrugCombAccuracyCOTORM/std": 0.44226428866386414, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 458.8125, "completions/min_length": 366.0, "epoch": 5.298529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0755635499954224, "kl": 0.007870071451179683, "learning_rate": 9.194051653389526e-07, "loss": 7.923590601421893e-05, "reward": 0.810692310333252, "reward_std": 0.22291842103004456, "rewards/DrugCombAccuracyCOTORM/mean": 0.7868028879165649, "rewards/DrugCombAccuracyCOTORM/std": 0.4034682512283325, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 499.0, "completions/min_length": 443.0, "epoch": 5.3, "frac_reward_zero_std": 0.0, "grad_norm": 1.4623463153839111, "kl": 0.006167643936350942, "learning_rate": 9.19335283972712e-07, "loss": 6.143748760223389e-05, "reward": 0.7301250100135803, "reward_std": 0.36746078729629517, "rewards/DrugCombAccuracyCOTORM/mean": 0.6978124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4644816815853119, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.44604745507240295, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 456.75, "completions/min_length": 346.0, "epoch": 5.301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00977239478379488, "kl": 0.006302300957031548, "learning_rate": 9.192653749817242e-07, "loss": 6.270177982514724e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 489.75, "completions/min_length": 404.0, "epoch": 5.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0642073154449463, "kl": 0.008345226640813053, "learning_rate": 9.191954383705945e-07, "loss": 8.236512803705409e-05, "reward": 0.5625, "reward_std": 0.03857583925127983, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 520.625, "completions/min_length": 438.0, "epoch": 5.304411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.2388925552368164, "kl": 0.007746372721157968, "learning_rate": 9.191254741439302e-07, "loss": 7.669627666473389e-05, "reward": 0.7028166651725769, "reward_std": 0.3973315954208374, "rewards/DrugCombAccuracyCOTORM/mean": 0.637374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.4857471287250519, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9291666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.1529342532157898, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 495.875, "completions/min_length": 421.0, "epoch": 5.305882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.02327517420053482, "kl": 0.008949971874244511, "learning_rate": 9.190554823063403e-07, "loss": 8.885945135261863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 427.25, "completions/min_length": 356.0, "epoch": 5.307352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011458425782620907, "kl": 0.006383560015819967, "learning_rate": 9.189854628624356e-07, "loss": 6.363978900481015e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 447.9375, "completions/min_length": 394.0, "epoch": 5.3088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.013342428021132946, "kl": 0.007801568717695773, "learning_rate": 9.18915415816829e-07, "loss": 7.73192587075755e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 469.9375, "completions/min_length": 412.0, "epoch": 5.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01861940324306488, "kl": 0.007452592020854354, "learning_rate": 9.188453411741351e-07, "loss": 7.468442345270887e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 451.5625, "completions/min_length": 368.0, "epoch": 5.311764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9741851687431335, "kl": 0.007533486699685454, "learning_rate": 9.187752389389701e-07, "loss": 7.588416337966919e-05, "reward": 0.6213333010673523, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 443.0625, "completions/min_length": 386.0, "epoch": 5.313235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016338571906089783, "kl": 0.008343964349478483, "learning_rate": 9.187051091159519e-07, "loss": 8.361288928426802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 492.75, "completions/min_length": 406.0, "epoch": 5.314705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.459986686706543, "kl": 0.007545798900537193, "learning_rate": 9.18634951709701e-07, "loss": 7.369369268417358e-05, "reward": 0.7250000238418579, "reward_std": 0.195383220911026, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4714045524597168, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 426.3125, "completions/min_length": 337.0, "epoch": 5.3161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.008420838974416256, "kl": 0.004973926232196391, "learning_rate": 9.185647667248387e-07, "loss": 4.988004366168752e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 426.75, "completions/min_length": 366.0, "epoch": 5.317647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.042788051068782806, "kl": 0.007404497358947992, "learning_rate": 9.184945541659888e-07, "loss": 7.527062552981079e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 444.5, "completions/min_length": 398.0, "epoch": 5.319117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8329254388809204, "kl": 0.006945160101167858, "learning_rate": 9.18424314037777e-07, "loss": 6.86347484588623e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 409.375, "completions/min_length": 353.0, "epoch": 5.320588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010152180679142475, "kl": 0.007683346979320049, "learning_rate": 9.1835404634483e-07, "loss": 7.673465006519109e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 532.1875, "completions/min_length": 405.0, "epoch": 5.322058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4825756549835205, "kl": 0.008142002276144922, "learning_rate": 9.182837510917774e-07, "loss": 8.13901424407959e-05, "reward": 0.550000011920929, "reward_std": 0.4183264970779419, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 415.0, "completions/min_length": 376.0, "epoch": 5.323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.3620927333831787, "kl": 0.011757726548239589, "learning_rate": 9.182134282832497e-07, "loss": 0.00011735680163837969, "reward": 0.3125, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 481.9375, "completions/min_length": 401.0, "epoch": 5.325, "frac_reward_zero_std": 0.5, "grad_norm": 1.445885419845581, "kl": 0.00977682915981859, "learning_rate": 9.181430779238797e-07, "loss": 9.776963997865096e-05, "reward": 0.8023750185966492, "reward_std": 0.13430556654930115, "rewards/DrugCombAccuracyCOTORM/mean": 0.7777083516120911, "rewards/DrugCombAccuracyCOTORM/std": 0.3221840262413025, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.24509069323539734, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 459.8125, "completions/min_length": 422.0, "epoch": 5.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02711057849228382, "kl": 0.007237979676574469, "learning_rate": 9.18072700018302e-07, "loss": 7.283517334144562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 472.125, "completions/min_length": 415.0, "epoch": 5.327941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.364578127861023, "kl": 0.01082951808348298, "learning_rate": 9.180022945711527e-07, "loss": 0.0001093745231628418, "reward": 0.2562500238418579, "reward_std": 0.3794882893562317, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.7719024419784546, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 489.375, "completions/min_length": 445.0, "epoch": 5.329411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8121011853218079, "kl": 0.009419225971214473, "learning_rate": 9.179318615870702e-07, "loss": 9.453445818508044e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 451.125, "completions/min_length": 401.0, "epoch": 5.330882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016542503610253334, "kl": 0.008127045235596597, "learning_rate": 9.178614010706942e-07, "loss": 8.168041676981375e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 423.8125, "completions/min_length": 373.0, "epoch": 5.33235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018284996971488, "kl": 0.008202858618460596, "learning_rate": 9.177909130266666e-07, "loss": 8.130892820190638e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 452.8125, "completions/min_length": 377.0, "epoch": 5.333823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.4011342525482178, "kl": 0.008540616137906909, "learning_rate": 9.17720397459631e-07, "loss": 8.635222911834717e-05, "reward": 0.75, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 368.8125, "completions/min_length": 299.0, "epoch": 5.3352941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.010184508748352528, "kl": 0.006145821884274483, "learning_rate": 9.176498543742327e-07, "loss": 6.135575677035376e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 469.5625, "completions/min_length": 327.0, "epoch": 5.336764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9848655462265015, "kl": 0.0077374938409775496, "learning_rate": 9.175792837751189e-07, "loss": 7.819043821655214e-05, "reward": 0.6868541836738586, "reward_std": 0.08668142557144165, "rewards/DrugCombAccuracyCOTORM/mean": 0.6450260281562805, "rewards/DrugCombAccuracyCOTORM/std": 0.39328208565711975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.30731815099716187, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 477.0625, "completions/min_length": 408.0, "epoch": 5.338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.8085349798202515, "kl": 0.008777496870607138, "learning_rate": 9.175086856669388e-07, "loss": 8.812546730041504e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 497.0, "completions/min_length": 378.0, "epoch": 5.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.2880152463912964, "kl": 0.009515137528069317, "learning_rate": 9.174380600543429e-07, "loss": 9.538108861306682e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 442.9375, "completions/min_length": 349.0, "epoch": 5.341176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9625042080879211, "kl": 0.009164390503428876, "learning_rate": 9.173674069419841e-07, "loss": 9.194329322781414e-05, "reward": 0.7024999856948853, "reward_std": 0.14145593345165253, "rewards/DrugCombAccuracyCOTORM/mean": 0.6697916984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.3860406279563904, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6694387197494507, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 475.0, "completions/min_length": 387.0, "epoch": 5.3426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0887244939804077, "kl": 0.006596519146114588, "learning_rate": 9.172967263345166e-07, "loss": 6.595243758056313e-05, "reward": 0.7971354722976685, "reward_std": 0.02062208205461502, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.24247947335243225, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8046875, "rewards/DrugCombCoverageCOTORM/std": 0.49784693121910095, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 498.0625, "completions/min_length": 430.0, "epoch": 5.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1665726900100708, "kl": 0.007739493623375893, "learning_rate": 9.17226018236597e-07, "loss": 7.727829506620765e-05, "reward": 0.3499999940395355, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 476.75, "completions/min_length": 393.0, "epoch": 5.345588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2759387493133545, "kl": 0.007929267827421427, "learning_rate": 9.171552826528831e-07, "loss": 7.911026477813721e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 434.3125, "completions/min_length": 401.0, "epoch": 5.347058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016901928931474686, "kl": 0.00940474565140903, "learning_rate": 9.170845195880349e-07, "loss": 9.3745460617356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 485.5625, "completions/min_length": 415.0, "epoch": 5.348529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.59073007106781, "kl": 0.011931231012567878, "learning_rate": 9.170137290467141e-07, "loss": 0.0001195073127746582, "reward": 0.47291669249534607, "reward_std": 0.42660757899284363, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 428.75, "completions/min_length": 365.0, "epoch": 5.35, "frac_reward_zero_std": 0.5, "grad_norm": 1.0003306865692139, "kl": 0.00855678750667721, "learning_rate": 9.16942911033584e-07, "loss": 8.607684867456555e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 510.8125, "completions/min_length": 412.0, "epoch": 5.351470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.889214813709259, "kl": 0.005410283454693854, "learning_rate": 9.168720655533102e-07, "loss": 5.412084283307195e-05, "reward": 0.8272500038146973, "reward_std": 0.1693042516708374, "rewards/DrugCombAccuracyCOTORM/mean": 0.7864062190055847, "rewards/DrugCombAccuracyCOTORM/std": 0.35961204767227173, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.981249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.07500000298023224, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 528.9375, "completions/min_length": 461.0, "epoch": 5.352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0320968627929688, "kl": 0.010372412856668234, "learning_rate": 9.168011926105597e-07, "loss": 0.0001034960150718689, "reward": 0.5776666402816772, "reward_std": 0.03981783613562584, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.394405335187912, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 484.3125, "completions/min_length": 373.0, "epoch": 5.354411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.2299507856369019, "kl": 0.010352820856496692, "learning_rate": 9.167302922100013e-07, "loss": 0.00010456889867782593, "reward": 0.9900000095367432, "reward_std": 0.02828424982726574, "rewards/DrugCombAccuracyCOTORM/mean": 0.987500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.05000000074505806, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 442.5625, "completions/min_length": 380.0, "epoch": 5.355882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.468277096748352, "kl": 0.009032599744386971, "learning_rate": 9.166593643563061e-07, "loss": 9.040534496307373e-05, "reward": 0.6875, "reward_std": 0.42211851477622986, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 452.625, "completions/min_length": 406.0, "epoch": 5.357352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008735444396734238, "kl": 0.005904572200961411, "learning_rate": 9.165884090541463e-07, "loss": 5.934113869443536e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 451.375, "completions/min_length": 405.0, "epoch": 5.358823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.5994853973388672, "kl": 0.006740601966157556, "learning_rate": 9.165174263081963e-07, "loss": 6.756274524377659e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 440.625, "completions/min_length": 394.0, "epoch": 5.360294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01442236453294754, "kl": 0.006945243803784251, "learning_rate": 9.164464161231324e-07, "loss": 6.946896610315889e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 465.0, "completions/min_length": 415.0, "epoch": 5.3617647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0123688243329525, "kl": 0.008586148731410503, "learning_rate": 9.163753785036323e-07, "loss": 8.60936997924e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/mean_length": 534.3125, "completions/min_length": 450.0, "epoch": 5.363235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3737324476242065, "kl": 0.008618851425126195, "learning_rate": 9.163043134543762e-07, "loss": 8.638203144073486e-05, "reward": 0.8234410285949707, "reward_std": 0.2826617956161499, "rewards/DrugCombAccuracyCOTORM/mean": 0.800134539604187, "rewards/DrugCombAccuracyCOTORM/std": 0.30623969435691833, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4962824583053589, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 479.5, "completions/min_length": 431.0, "epoch": 5.364705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.09447181224823, "kl": 0.01568634749855846, "learning_rate": 9.162332209800454e-07, "loss": 0.00015234430611599237, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 408.125, "completions/min_length": 368.0, "epoch": 5.366176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.011102789081633091, "kl": 0.007694051135331392, "learning_rate": 9.161621010853233e-07, "loss": 7.685866148676723e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 431.0, "completions/min_length": 365.0, "epoch": 5.367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.976285457611084, "kl": 0.00794957100879401, "learning_rate": 9.160909537748953e-07, "loss": 7.918104529380798e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 447.875, "completions/min_length": 392.0, "epoch": 5.3691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.985437273979187, "kl": 0.008015824249014258, "learning_rate": 9.160197790534482e-07, "loss": 8.067488670349121e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 463.3125, "completions/min_length": 391.0, "epoch": 5.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01195085234940052, "kl": 0.006802927702665329, "learning_rate": 9.159485769256709e-07, "loss": 6.811173807363957e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 457.1875, "completions/min_length": 400.0, "epoch": 5.372058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8982107639312744, "kl": 0.007678937166929245, "learning_rate": 9.158773473962537e-07, "loss": 7.634368375875056e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 456.0, "completions/min_length": 377.0, "epoch": 5.373529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.02430819161236286, "kl": 0.009216330014169216, "learning_rate": 9.158060904698897e-07, "loss": 9.313647024100646e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 480.1875, "completions/min_length": 423.0, "epoch": 5.375, "frac_reward_zero_std": 0.0, "grad_norm": 1.3746068477630615, "kl": 0.007278060424141586, "learning_rate": 9.157348061512726e-07, "loss": 7.261335849761963e-05, "reward": 0.36250001192092896, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 448.4375, "completions/min_length": 382.0, "epoch": 5.376470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.018577221781015396, "kl": 0.006805971614085138, "learning_rate": 9.156634944450985e-07, "loss": 6.806802412029356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 420.4375, "completions/min_length": 357.0, "epoch": 5.377941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02504599466919899, "kl": 0.0077401959570124745, "learning_rate": 9.155921553560653e-07, "loss": 7.789953815517947e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 438.0625, "completions/min_length": 385.0, "epoch": 5.379411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9795949459075928, "kl": 0.00865489395800978, "learning_rate": 9.155207888888728e-07, "loss": 8.711963891983032e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 474.5, "completions/min_length": 376.0, "epoch": 5.3808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2057735919952393, "kl": 0.010219997959211469, "learning_rate": 9.154493950482221e-07, "loss": 0.00010132789611816406, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 452.8125, "completions/min_length": 417.0, "epoch": 5.382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9101736545562744, "kl": 0.007249500253237784, "learning_rate": 9.153779738388167e-07, "loss": 7.295953400898725e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 458.6875, "completions/min_length": 378.0, "epoch": 5.383823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2710378170013428, "kl": 0.007449473603628576, "learning_rate": 9.153065252653616e-07, "loss": 7.455423474311829e-05, "reward": 0.3687500059604645, "reward_std": 0.28465205430984497, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.9105859398841858, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 505.625, "completions/min_length": 429.0, "epoch": 5.385294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9259838461875916, "kl": 0.009125172742642462, "learning_rate": 9.152350493325636e-07, "loss": 9.058695286512375e-05, "reward": 0.1745000034570694, "reward_std": 0.030552063137292862, "rewards/DrugCombAccuracyCOTORM/mean": 0.06187500059604645, "rewards/DrugCombAccuracyCOTORM/std": 0.08250000327825546, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 414.25, "completions/min_length": 346.0, "epoch": 5.386764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01589399203658104, "kl": 0.006260655354708433, "learning_rate": 9.151635460451312e-07, "loss": 6.221294461283833e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 433.9375, "completions/min_length": 370.0, "epoch": 5.3882352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9237493276596069, "kl": 0.00831441255286336, "learning_rate": 9.150920154077753e-07, "loss": 8.2997496065218e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 495.8125, "completions/min_length": 425.0, "epoch": 5.389705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8237970471382141, "kl": 0.006053956691175699, "learning_rate": 9.150204574252078e-07, "loss": 6.065145134925842e-05, "reward": 0.7794166803359985, "reward_std": 0.18720921874046326, "rewards/DrugCombAccuracyCOTORM/mean": 0.7425000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.39771851897239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 418.375, "completions/min_length": 392.0, "epoch": 5.391176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.023930124938488007, "kl": 0.009954529581591487, "learning_rate": 9.149488721021428e-07, "loss": 0.00010131644376087934, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 460.5625, "completions/min_length": 376.0, "epoch": 5.392647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1795670986175537, "kl": 0.009856030461378396, "learning_rate": 9.148772594432963e-07, "loss": 9.937584400177002e-05, "reward": 0.987500011920929, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 441.75, "completions/min_length": 333.0, "epoch": 5.394117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8122435212135315, "kl": 0.008509664796292782, "learning_rate": 9.148056194533858e-07, "loss": 8.472720946883783e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 470.5, "completions/min_length": 407.0, "epoch": 5.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0311557054519653, "kl": 0.009988684905692935, "learning_rate": 9.147339521371309e-07, "loss": 9.93683934211731e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 462.0625, "completions/min_length": 386.0, "epoch": 5.397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.008857767097651958, "kl": 0.006596442428417504, "learning_rate": 9.146622574992526e-07, "loss": 6.594082515221089e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 457.0, "completions/min_length": 387.0, "epoch": 5.398529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660029962658882, "kl": 0.008551869541406631, "learning_rate": 9.145905355444744e-07, "loss": 8.583140152040869e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 465.0, "completions/min_length": 384.0, "epoch": 5.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.8423398733139038, "kl": 0.00836604309733957, "learning_rate": 9.145187862775208e-07, "loss": 8.422881364822388e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 526.375, "completions/min_length": 421.0, "epoch": 5.401470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3613486289978027, "kl": 0.010844277683645487, "learning_rate": 9.144470097031186e-07, "loss": 0.00010782480239868164, "reward": 0.5246666669845581, "reward_std": 0.3199697434902191, "rewards/DrugCombAccuracyCOTORM/mean": 0.41624999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.4050000011920929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 474.9375, "completions/min_length": 377.0, "epoch": 5.402941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0230233334004879, "kl": 0.006362060084939003, "learning_rate": 9.143752058259963e-07, "loss": 6.275901978369802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 445.5625, "completions/min_length": 362.0, "epoch": 5.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01378644723445177, "kl": 0.006811137427575886, "learning_rate": 9.143033746508839e-07, "loss": 6.776046939194202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 466.0, "completions/min_length": 398.0, "epoch": 5.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6141242980957031, "kl": 0.007595258881337941, "learning_rate": 9.142315161825138e-07, "loss": 7.54992215661332e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 405.8125, "completions/min_length": 305.0, "epoch": 5.4073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01691814325749874, "kl": 0.006677372381091118, "learning_rate": 9.141596304256197e-07, "loss": 6.613928417209536e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 512.8125, "completions/min_length": 446.0, "epoch": 5.408823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012823558412492275, "kl": 0.0075127758318558335, "learning_rate": 9.140877173849373e-07, "loss": 7.514906610595062e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 429.5625, "completions/min_length": 384.0, "epoch": 5.410294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.953081488609314, "kl": 0.008731076959520578, "learning_rate": 9.14015777065204e-07, "loss": 8.537620306015015e-05, "reward": 0.6324374675750732, "reward_std": 0.17485880851745605, "rewards/DrugCombAccuracyCOTORM/mean": 0.6089062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.46558207273483276, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.453125, "rewards/DrugCombCoverageCOTORM/std": 0.8718693852424622, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 471.0, "completions/min_length": 388.0, "epoch": 5.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8107783794403076, "kl": 0.007581930374726653, "learning_rate": 9.139438094711589e-07, "loss": 7.620453834533691e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 489.125, "completions/min_length": 422.0, "epoch": 5.413235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.169888973236084, "kl": 0.006815183092840016, "learning_rate": 9.138718146075433e-07, "loss": 6.839632987976074e-05, "reward": 0.7756249904632568, "reward_std": 0.3519848585128784, "rewards/DrugCombAccuracyCOTORM/mean": 0.739062488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3997258245944977, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 475.625, "completions/min_length": 409.0, "epoch": 5.4147058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.0290178060531616, "kl": 0.00945158104877919, "learning_rate": 9.137997924790999e-07, "loss": 9.433329978492111e-05, "reward": 0.6000000238418579, "reward_std": 0.16256865859031677, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 422.9375, "completions/min_length": 364.0, "epoch": 5.416176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.013178651221096516, "kl": 0.007126613985747099, "learning_rate": 9.137277430905733e-07, "loss": 7.092572923284024e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 402.4375, "completions/min_length": 359.0, "epoch": 5.41764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020059097558259964, "kl": 0.007173580466769636, "learning_rate": 9.136556664467101e-07, "loss": 7.142810500226915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 443.1875, "completions/min_length": 398.0, "epoch": 5.419117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01111631840467453, "kl": 0.007346906349994242, "learning_rate": 9.135835625522584e-07, "loss": 7.397706212941557e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 441.8125, "completions/min_length": 390.0, "epoch": 5.420588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02362520806491375, "kl": 0.008512568078003824, "learning_rate": 9.135114314119683e-07, "loss": 8.28830961836502e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 426.6875, "completions/min_length": 365.0, "epoch": 5.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.4060909748077393, "kl": 0.012209525564685464, "learning_rate": 9.134392730305915e-07, "loss": 0.00012331083416938782, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 459.8125, "completions/min_length": 311.0, "epoch": 5.423529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.386685848236084, "kl": 0.008208940271288157, "learning_rate": 9.133670874128817e-07, "loss": 8.143484592437744e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 497.3125, "completions/min_length": 436.0, "epoch": 5.425, "frac_reward_zero_std": 0.5, "grad_norm": 0.8486396670341492, "kl": 0.010210331995040178, "learning_rate": 9.132948745635942e-07, "loss": 0.00010204315185546875, "reward": 0.5898749828338623, "reward_std": 0.17392152547836304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5693749785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.5049814581871033, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34375, "rewards/DrugCombCoverageCOTORM/std": 0.9437293410301208, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 510.4375, "completions/min_length": 430.0, "epoch": 5.426470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3738107681274414, "kl": 0.008924102177843451, "learning_rate": 9.132226344874865e-07, "loss": 8.90493392944336e-05, "reward": 0.4960141181945801, "reward_std": 0.2627158463001251, "rewards/DrugCombAccuracyCOTORM/mean": 0.42548641562461853, "rewards/DrugCombAccuracyCOTORM/std": 0.46455657482147217, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5562499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.5316249132156372, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 446.5625, "completions/min_length": 385.0, "epoch": 5.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8840134739875793, "kl": 0.0090605498990044, "learning_rate": 9.131503671893172e-07, "loss": 8.937716484069824e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 427.1875, "completions/min_length": 363.0, "epoch": 5.429411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.00724074337631464, "kl": 0.005890044732950628, "learning_rate": 9.130780726738473e-07, "loss": 5.901061376789585e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 388.125, "completions/min_length": 325.0, "epoch": 5.430882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012735345400869846, "kl": 0.008424009312875569, "learning_rate": 9.130057509458393e-07, "loss": 8.516167872585356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/mean_length": 570.6875, "completions/min_length": 416.0, "epoch": 5.432352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.1608073711395264, "kl": 0.011247055139392614, "learning_rate": 9.129334020100577e-07, "loss": 0.00011578202247619629, "reward": 0.22690972685813904, "reward_std": 0.1573946624994278, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7847222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.272826224565506, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 442.25, "completions/min_length": 363.0, "epoch": 5.4338235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0909768342971802, "kl": 0.010220875614322722, "learning_rate": 9.128610258712685e-07, "loss": 0.0001026540994644165, "reward": 0.893750011920929, "reward_std": 0.1971900761127472, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 438.4375, "completions/min_length": 373.0, "epoch": 5.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02711688168346882, "kl": 0.007329226471483707, "learning_rate": 9.127886225342399e-07, "loss": 7.54111388232559e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 475.6875, "completions/min_length": 424.0, "epoch": 5.436764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7725270390510559, "kl": 0.007087599486112595, "learning_rate": 9.127161920037413e-07, "loss": 7.065333193168044e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 476.125, "completions/min_length": 418.0, "epoch": 5.438235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015694141387939453, "kl": 0.01066630776040256, "learning_rate": 9.126437342845444e-07, "loss": 0.00010471288987901062, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 434.625, "completions/min_length": 402.0, "epoch": 5.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.023697277531027794, "kl": 0.007414502324536443, "learning_rate": 9.125712493814227e-07, "loss": 7.391968392767012e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 463.875, "completions/min_length": 392.0, "epoch": 5.4411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9135987758636475, "kl": 0.007797100697644055, "learning_rate": 9.124987372991511e-07, "loss": 7.853657007217407e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 417.875, "completions/min_length": 360.0, "epoch": 5.442647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01653258688747883, "kl": 0.008847984368912876, "learning_rate": 9.124261980425067e-07, "loss": 8.866270945873111e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 444.3125, "completions/min_length": 353.0, "epoch": 5.444117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0349247455596924, "kl": 0.0063798208720982075, "learning_rate": 9.123536316162679e-07, "loss": 6.50138536002487e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 461.625, "completions/min_length": 406.0, "epoch": 5.445588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.2242743968963623, "kl": 0.01190422591753304, "learning_rate": 9.122810380252155e-07, "loss": 0.0001177374433609657, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 482.8125, "completions/min_length": 409.0, "epoch": 5.447058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3049408197402954, "kl": 0.007720425142906606, "learning_rate": 9.122084172741317e-07, "loss": 7.697194814682007e-05, "reward": 0.7104166746139526, "reward_std": 0.3360440135002136, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3824869990348816, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.7719024419784546, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 441.375, "completions/min_length": 381.0, "epoch": 5.448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9759613871574402, "kl": 0.006453910376876593, "learning_rate": 9.121357693678004e-07, "loss": 6.462925375672057e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 415.1875, "completions/min_length": 343.0, "epoch": 5.45, "frac_reward_zero_std": 0.5, "grad_norm": 1.581024408340454, "kl": 0.007754784426651895, "learning_rate": 9.120630943110077e-07, "loss": 7.71326303947717e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 489.1875, "completions/min_length": 437.0, "epoch": 5.451470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.041068024933338165, "kl": 0.011070922017097473, "learning_rate": 9.119903921085412e-07, "loss": 0.00011038410593755543, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 429.25, "completions/min_length": 361.0, "epoch": 5.452941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1066895723342896, "kl": 0.010160978185012937, "learning_rate": 9.119176627651903e-07, "loss": 0.00010128319263458252, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 438.1875, "completions/min_length": 394.0, "epoch": 5.454411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018536826595664024, "kl": 0.007218679529614747, "learning_rate": 9.118449062857461e-07, "loss": 7.191067561507225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 497.875, "completions/min_length": 430.0, "epoch": 5.455882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.619001865386963, "kl": 0.011437055887654424, "learning_rate": 9.117721226750018e-07, "loss": 0.00011344254016876221, "reward": 0.32637500762939453, "reward_std": 0.22212225198745728, "rewards/DrugCombAccuracyCOTORM/mean": 0.22697916626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.31778645515441895, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3786855936050415, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 503.1875, "completions/min_length": 427.0, "epoch": 5.45735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2164041996002197, "kl": 0.006540944683365524, "learning_rate": 9.116993119377521e-07, "loss": 6.540119647979736e-05, "reward": 0.848312497138977, "reward_std": 0.3191484808921814, "rewards/DrugCombAccuracyCOTORM/mean": 0.8279687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.3735184371471405, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.341183602809906, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 411.25, "completions/min_length": 312.0, "epoch": 5.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.023110195994377136, "kl": 0.006781217874959111, "learning_rate": 9.116264740787935e-07, "loss": 6.772082269890234e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 462.125, "completions/min_length": 397.0, "epoch": 5.4602941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.14207009971141815, "kl": 0.011524255736730993, "learning_rate": 9.115536091029247e-07, "loss": 0.00011423461546655744, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 423.4375, "completions/min_length": 309.0, "epoch": 5.461764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.03768789768219, "kl": 0.008276627282612026, "learning_rate": 9.114807170149455e-07, "loss": 8.193925896193832e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 494.875, "completions/min_length": 455.0, "epoch": 5.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015451233834028244, "kl": 0.007200126652605832, "learning_rate": 9.114077978196579e-07, "loss": 7.19846721040085e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 441.375, "completions/min_length": 395.0, "epoch": 5.464705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01038418523967266, "kl": 0.008324422989971936, "learning_rate": 9.11334851521866e-07, "loss": 8.256787987193093e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 454.4375, "completions/min_length": 404.0, "epoch": 5.466176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3413913249969482, "kl": 0.009178887237794697, "learning_rate": 9.112618781263748e-07, "loss": 9.174644947052002e-05, "reward": 0.8051249980926514, "reward_std": 0.3339453339576721, "rewards/DrugCombAccuracyCOTORM/mean": 0.7603124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43035051226615906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 466.4375, "completions/min_length": 400.0, "epoch": 5.4676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9223693013191223, "kl": 0.00888874928932637, "learning_rate": 9.111888776379919e-07, "loss": 8.930377953220159e-05, "reward": 0.7352083325386047, "reward_std": 0.08815589547157288, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.35352590680122375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.21273136138916016, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 444.875, "completions/min_length": 420.0, "epoch": 5.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01132879126816988, "kl": 0.0055303063709288836, "learning_rate": 9.111158500615265e-07, "loss": 5.5430915381293744e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 423.0, "completions/min_length": 350.0, "epoch": 5.470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.018389519304037094, "kl": 0.008400482358410954, "learning_rate": 9.110427954017891e-07, "loss": 8.410002919845283e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 483.5625, "completions/min_length": 442.0, "epoch": 5.472058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016467228531837463, "kl": 0.008536144043318927, "learning_rate": 9.109697136635926e-07, "loss": 8.51569784572348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 464.875, "completions/min_length": 370.0, "epoch": 5.473529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.017749056220054626, "kl": 0.008347162394784391, "learning_rate": 9.108966048517515e-07, "loss": 8.361663640243933e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 472.125, "completions/min_length": 414.0, "epoch": 5.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9410569667816162, "kl": 0.00721998221706599, "learning_rate": 9.108234689710819e-07, "loss": 7.196515798568726e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 474.0, "completions/min_length": 406.0, "epoch": 5.476470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2849714756011963, "kl": 0.01055831671692431, "learning_rate": 9.107503060264017e-07, "loss": 0.00010505318641662598, "reward": 0.5324166417121887, "reward_std": 0.1707897186279297, "rewards/DrugCombAccuracyCOTORM/mean": 0.4650000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.49242258071899414, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6465721726417542, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 487.625, "completions/min_length": 432.0, "epoch": 5.477941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.027598261833191, "kl": 0.0065588210709393024, "learning_rate": 9.106771160225309e-07, "loss": 6.545823998749256e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 408.0625, "completions/min_length": 366.0, "epoch": 5.479411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796046495437622, "kl": 0.007882614620029926, "learning_rate": 9.106038989642909e-07, "loss": 7.820028258720413e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 496.9375, "completions/min_length": 418.0, "epoch": 5.480882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9159601926803589, "kl": 0.007940543117001653, "learning_rate": 9.105306548565052e-07, "loss": 7.94529914855957e-05, "reward": 0.6472374796867371, "reward_std": 0.06799083948135376, "rewards/DrugCombAccuracyCOTORM/mean": 0.5837864875793457, "rewards/DrugCombAccuracyCOTORM/std": 0.4436182677745819, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2274557501077652, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 488.375, "completions/min_length": 424.0, "epoch": 5.482352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.019627578556537628, "kl": 0.008699361584149301, "learning_rate": 9.10457383703999e-07, "loss": 8.652971155242994e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 465.0, "completions/min_length": 406.0, "epoch": 5.483823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.025455165654420853, "kl": 0.009312472539022565, "learning_rate": 9.10384085511599e-07, "loss": 9.296035568695515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 393.25, "completions/min_length": 350.0, "epoch": 5.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.013517209328711033, "kl": 0.006784495315514505, "learning_rate": 9.10310760284134e-07, "loss": 6.760391261195764e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 473.125, "completions/min_length": 429.0, "epoch": 5.4867647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.8200803995132446, "kl": 0.009358686860650778, "learning_rate": 9.102374080264345e-07, "loss": 9.338142990600318e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 455.375, "completions/min_length": 384.0, "epoch": 5.488235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8443600535392761, "kl": 0.008912254241295159, "learning_rate": 9.101640287433328e-07, "loss": 8.851682650856674e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 459.125, "completions/min_length": 419.0, "epoch": 5.489705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012868698686361313, "kl": 0.005732181831263006, "learning_rate": 9.100906224396628e-07, "loss": 5.735272861784324e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 416.6875, "completions/min_length": 383.0, "epoch": 5.491176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.009555019438266754, "kl": 0.006298448191955686, "learning_rate": 9.100171891202604e-07, "loss": 6.27108383923769e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 449.375, "completions/min_length": 417.0, "epoch": 5.492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9660850763320923, "kl": 0.010387015994638205, "learning_rate": 9.099437287899631e-07, "loss": 0.0001039355993270874, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 444.375, "completions/min_length": 389.0, "epoch": 5.4941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.025998611003160477, "kl": 0.008559313719160855, "learning_rate": 9.098702414536106e-07, "loss": 8.560076093999669e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 366.125, "completions/min_length": 288.0, "epoch": 5.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01762152463197708, "kl": 0.007621241384185851, "learning_rate": 9.097967271160436e-07, "loss": 7.570574234705418e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 454.9375, "completions/min_length": 399.0, "epoch": 5.497058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0259171761572361, "kl": 0.01039486238732934, "learning_rate": 9.097231857821055e-07, "loss": 0.00010299873247276992, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 438.8125, "completions/min_length": 350.0, "epoch": 5.498529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.364880919456482, "kl": 0.009111683699302375, "learning_rate": 9.096496174566407e-07, "loss": 9.147077798843384e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 422.0, "completions/min_length": 369.0, "epoch": 5.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00929255224764347, "kl": 0.006223820964805782, "learning_rate": 9.095760221444959e-07, "loss": 6.228344864211977e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 456.375, "completions/min_length": 398.0, "epoch": 5.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9365602135658264, "kl": 0.008593358797952533, "learning_rate": 9.095023998505192e-07, "loss": 8.592457743361592e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 454.125, "completions/min_length": 401.0, "epoch": 5.502941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012399043887853622, "kl": 0.009856067947112024, "learning_rate": 9.094287505795606e-07, "loss": 9.884657629299909e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 425.8125, "completions/min_length": 364.0, "epoch": 5.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.016746077686548233, "kl": 0.007149291108362377, "learning_rate": 9.093550743364722e-07, "loss": 7.139065564842895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 436.5, "completions/min_length": 376.0, "epoch": 5.5058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01211787573993206, "kl": 0.007184694521129131, "learning_rate": 9.092813711261073e-07, "loss": 7.258696132339537e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 542.875, "completions/min_length": 475.0, "epoch": 5.507352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.5350632667541504, "kl": 0.01055559713859111, "learning_rate": 9.092076409533217e-07, "loss": 0.00010517239570617676, "reward": 0.8608125448226929, "reward_std": 0.29600298404693604, "rewards/DrugCombAccuracyCOTORM/mean": 0.8279687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.3735184371471405, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 510.25, "completions/min_length": 435.0, "epoch": 5.508823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.5789121389389038, "kl": 0.007976561668328941, "learning_rate": 9.091338838229721e-07, "loss": 7.969141006469727e-05, "reward": 0.7875000238418579, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 433.9375, "completions/min_length": 387.0, "epoch": 5.510294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.018846139311790466, "kl": 0.007875089882872999, "learning_rate": 9.090600997399177e-07, "loss": 7.878726319177076e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 473.875, "completions/min_length": 388.0, "epoch": 5.511764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9902281165122986, "kl": 0.011335995281115174, "learning_rate": 9.08986288709019e-07, "loss": 0.00011357844778103754, "reward": 0.960812509059906, "reward_std": 0.11083899438381195, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 494.4375, "completions/min_length": 396.0, "epoch": 5.5132352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.2048736810684204, "kl": 0.010592504870146513, "learning_rate": 9.089124507351388e-07, "loss": 0.00010610227764118463, "reward": 0.48750001192092896, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 473.0, "completions/min_length": 407.0, "epoch": 5.514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1415846347808838, "kl": 0.009342303848825395, "learning_rate": 9.088385858231411e-07, "loss": 9.27075743675232e-05, "reward": 0.8531041741371155, "reward_std": 0.014790322631597519, "rewards/DrugCombAccuracyCOTORM/mean": 0.8254948258399963, "rewards/DrugCombAccuracyCOTORM/std": 0.1827559769153595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 472.4375, "completions/min_length": 421.0, "epoch": 5.516176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009070143103599548, "kl": 0.006756980321370065, "learning_rate": 9.087646939778919e-07, "loss": 6.771214248146862e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 428.0625, "completions/min_length": 383.0, "epoch": 5.517647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012612045742571354, "kl": 0.006183713325299323, "learning_rate": 9.086907752042592e-07, "loss": 6.178465264383703e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/mean_length": 538.625, "completions/min_length": 450.0, "epoch": 5.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.054641589522361755, "kl": 0.011306499131023884, "learning_rate": 9.086168295071126e-07, "loss": 0.00011319851910229772, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.3125, "completions/min_length": 417.0, "epoch": 5.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7298375368118286, "kl": 0.00864748703315854, "learning_rate": 9.085428568913231e-07, "loss": 8.643046021461487e-05, "reward": 0.7091250419616699, "reward_std": 0.18166959285736084, "rewards/DrugCombAccuracyCOTORM/mean": 0.6559374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.4617077708244324, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 464.375, "completions/min_length": 407.0, "epoch": 5.522058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.186193585395813, "kl": 0.007764186826534569, "learning_rate": 9.084688573617642e-07, "loss": 7.742066372884437e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 453.5, "completions/min_length": 404.0, "epoch": 5.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013129574246704578, "kl": 0.007990526501089334, "learning_rate": 9.08394830923311e-07, "loss": 7.963424286572263e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 441.75, "completions/min_length": 391.0, "epoch": 5.525, "frac_reward_zero_std": 0.5, "grad_norm": 0.9313709139823914, "kl": 0.00965762382838875, "learning_rate": 9.083207775808394e-07, "loss": 9.713249164633453e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 409.375, "completions/min_length": 353.0, "epoch": 5.526470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.052741553634405136, "kl": 0.009528524708002806, "learning_rate": 9.082466973392285e-07, "loss": 9.538410813547671e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 450.375, "completions/min_length": 388.0, "epoch": 5.527941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.026467692106962204, "kl": 0.01040169270709157, "learning_rate": 9.081725902033584e-07, "loss": 0.00010477867908775806, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 417.1875, "completions/min_length": 372.0, "epoch": 5.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013353492133319378, "kl": 0.0070240573841147125, "learning_rate": 9.080984561781109e-07, "loss": 7.027011452009901e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 441.1875, "completions/min_length": 407.0, "epoch": 5.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8773356080055237, "kl": 0.007418537396006286, "learning_rate": 9.080242952683699e-07, "loss": 7.366762292804196e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 461.1875, "completions/min_length": 412.0, "epoch": 5.5323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.1506749391555786, "kl": 0.00944262626580894, "learning_rate": 9.079501074790208e-07, "loss": 9.420754213351756e-05, "reward": 0.8708333373069763, "reward_std": 0.2032962292432785, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 450.9375, "completions/min_length": 387.0, "epoch": 5.533823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011379660107195377, "kl": 0.00804731622338295, "learning_rate": 9.078758928149511e-07, "loss": 8.05100571596995e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 471.6875, "completions/min_length": 402.0, "epoch": 5.535294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.021639689803123474, "kl": 0.009741890011355281, "learning_rate": 9.078016512810498e-07, "loss": 9.667269478086382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 471.5, "completions/min_length": 391.0, "epoch": 5.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9835339188575745, "kl": 0.008831158629618585, "learning_rate": 9.077273828822077e-07, "loss": 8.773412264417857e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 492.3125, "completions/min_length": 412.0, "epoch": 5.538235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010554029606282711, "kl": 0.006852096877992153, "learning_rate": 9.076530876233174e-07, "loss": 6.86994826537557e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 474.375, "completions/min_length": 416.0, "epoch": 5.5397058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.00919431447982788, "kl": 0.006099978578276932, "learning_rate": 9.075787655092735e-07, "loss": 6.078841397538781e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 476.875, "completions/min_length": 414.0, "epoch": 5.541176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.03554144874215126, "kl": 0.008537981309928, "learning_rate": 9.075044165449719e-07, "loss": 8.611514931544662e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 456.1875, "completions/min_length": 410.0, "epoch": 5.54264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0420351028442383, "kl": 0.008647850947454572, "learning_rate": 9.074300407353106e-07, "loss": 8.664744382258505e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 467.125, "completions/min_length": 411.0, "epoch": 5.544117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.8851512670516968, "kl": 0.011602858314290643, "learning_rate": 9.073556380851892e-07, "loss": 0.00011562556028366089, "reward": 0.5445833206176758, "reward_std": 0.37712085247039795, "rewards/DrugCombAccuracyCOTORM/mean": 0.4437500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4539732038974762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 440.5625, "completions/min_length": 366.0, "epoch": 5.545588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9699930548667908, "kl": 0.012422443833202124, "learning_rate": 9.072812085995094e-07, "loss": 0.00012286007404327393, "reward": 0.9291666746139526, "reward_std": 0.07572401314973831, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 500.625, "completions/min_length": 444.0, "epoch": 5.547058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2084269523620605, "kl": 0.012304781470447779, "learning_rate": 9.072067522831741e-07, "loss": 0.00012355364742688835, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 431.1875, "completions/min_length": 393.0, "epoch": 5.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.019033916294574738, "kl": 0.009515785612165928, "learning_rate": 9.071322691410887e-07, "loss": 9.418785339221358e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 460.5625, "completions/min_length": 428.0, "epoch": 5.55, "frac_reward_zero_std": 0.5, "grad_norm": 0.8602107167243958, "kl": 0.006876481231302023, "learning_rate": 9.070577591781597e-07, "loss": 6.865560135338455e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 531.5625, "completions/min_length": 482.0, "epoch": 5.551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011706001125276089, "kl": 0.00615792244207114, "learning_rate": 9.069832223992954e-07, "loss": 6.16256074863486e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 424.8125, "completions/min_length": 368.0, "epoch": 5.552941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011388727463781834, "kl": 0.009008385008201003, "learning_rate": 9.069086588094066e-07, "loss": 8.914948557503521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 510.125, "completions/min_length": 442.0, "epoch": 5.554411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.6091465950012207, "kl": 0.014489947585389018, "learning_rate": 9.068340684134051e-07, "loss": 0.0001441463828086853, "reward": 0.4151666760444641, "reward_std": 0.3764796555042267, "rewards/DrugCombAccuracyCOTORM/mean": 0.32625001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.43553608655929565, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6652763485908508, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 423.375, "completions/min_length": 344.0, "epoch": 5.555882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8677144050598145, "kl": 0.00978579930961132, "learning_rate": 9.067594512162047e-07, "loss": 9.761005640029907e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 454.625, "completions/min_length": 367.0, "epoch": 5.557352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.021912116557359695, "kl": 0.0078642088919878, "learning_rate": 9.066848072227209e-07, "loss": 7.752746023470536e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 488.25, "completions/min_length": 418.0, "epoch": 5.5588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.008548258803784847, "kl": 0.006647795555181801, "learning_rate": 9.066101364378713e-07, "loss": 6.64690014673397e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 460.5, "completions/min_length": 379.0, "epoch": 5.560294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.857348918914795, "kl": 0.01508043590001762, "learning_rate": 9.065354388665749e-07, "loss": 0.00015035271644592285, "reward": 0.7333333492279053, "reward_std": 0.31440168619155884, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 465.625, "completions/min_length": 376.0, "epoch": 5.561764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734851837158203, "kl": 0.009547037072479725, "learning_rate": 9.064607145137526e-07, "loss": 9.621679782867432e-05, "reward": 0.7749999761581421, "reward_std": 0.41661906242370605, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 419.3125, "completions/min_length": 354.0, "epoch": 5.563235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.028499411419034004, "kl": 0.012905379524454474, "learning_rate": 9.063859633843271e-07, "loss": 0.00012796305236406624, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 452.25, "completions/min_length": 399.0, "epoch": 5.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.3061226606369019, "kl": 0.00910588656552136, "learning_rate": 9.063111854832228e-07, "loss": 9.072474495042115e-05, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 469.25, "completions/min_length": 380.0, "epoch": 5.5661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01684590056538582, "kl": 0.008003119728527963, "learning_rate": 9.062363808153658e-07, "loss": 8.174261165549979e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 432.1875, "completions/min_length": 387.0, "epoch": 5.567647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0438168048858643, "kl": 0.009447751333937049, "learning_rate": 9.06161549385684e-07, "loss": 9.375065565109253e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 474.9375, "completions/min_length": 393.0, "epoch": 5.569117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.4790793657302856, "kl": 0.010341502376832068, "learning_rate": 9.060866911991073e-07, "loss": 0.00010500103235244751, "reward": 0.5587083101272583, "reward_std": 0.38828545808792114, "rewards/DrugCombAccuracyCOTORM/mean": 0.49916666746139526, "rewards/DrugCombAccuracyCOTORM/std": 0.42803168296813965, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.5339189171791077, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 428.625, "completions/min_length": 376.0, "epoch": 5.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8174168467521667, "kl": 0.008062954409979284, "learning_rate": 9.060118062605671e-07, "loss": 8.168451313395053e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 429.375, "completions/min_length": 387.0, "epoch": 5.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01116360817104578, "kl": 0.0075403559021651745, "learning_rate": 9.059368945749964e-07, "loss": 7.54592110752128e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 499.0, "completions/min_length": 427.0, "epoch": 5.573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9433894753456116, "kl": 0.009626233135350049, "learning_rate": 9.058619561473306e-07, "loss": 9.579956531524658e-05, "reward": 0.8602499961853027, "reward_std": 0.1932879090309143, "rewards/DrugCombAccuracyCOTORM/mean": 0.8331249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.3604528605937958, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 430.6875, "completions/min_length": 375.0, "epoch": 5.575, "frac_reward_zero_std": 0.5, "grad_norm": 1.1295181512832642, "kl": 0.008434523944742978, "learning_rate": 9.057869909825061e-07, "loss": 8.594617247581482e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 438.9375, "completions/min_length": 386.0, "epoch": 5.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.5887080430984497, "kl": 0.011813639546744525, "learning_rate": 9.057119990854616e-07, "loss": 0.00011835247278213501, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 495.75, "completions/min_length": 376.0, "epoch": 5.577941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1790634393692017, "kl": 0.01200593018438667, "learning_rate": 9.056369804611374e-07, "loss": 0.00011887782602570951, "reward": 0.7603333592414856, "reward_std": 0.11367514729499817, "rewards/DrugCombAccuracyCOTORM/mean": 0.7138094902038574, "rewards/DrugCombAccuracyCOTORM/std": 0.34328120946884155, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8928571343421936, "rewards/DrugCombCoverageCOTORM/std": 0.1916629672050476, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 456.5, "completions/min_length": 403.0, "epoch": 5.579411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012870177626609802, "kl": 0.007590281427837908, "learning_rate": 9.055619351144753e-07, "loss": 7.499789353460073e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 426.8125, "completions/min_length": 344.0, "epoch": 5.580882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.27072644233703613, "kl": 0.01469710236415267, "learning_rate": 9.054868630504194e-07, "loss": 0.00014532962813973427, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 463.5, "completions/min_length": 372.0, "epoch": 5.58235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7303978204727173, "kl": 0.010186620056629181, "learning_rate": 9.054117642739149e-07, "loss": 0.00010160756937693805, "reward": 0.9881159663200378, "reward_std": 0.033613171428442, "rewards/DrugCombAccuracyCOTORM/mean": 0.9851449131965637, "rewards/DrugCombAccuracyCOTORM/std": 0.05942028760910034, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 399.8125, "completions/min_length": 341.0, "epoch": 5.583823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.6691378355026245, "kl": 0.008769407286308706, "learning_rate": 9.053366387899097e-07, "loss": 8.793860615696758e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 447.5, "completions/min_length": 406.0, "epoch": 5.5852941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 1.1262747049331665, "kl": 0.007301698788069189, "learning_rate": 9.052614866033524e-07, "loss": 7.323567842831835e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 463.8125, "completions/min_length": 394.0, "epoch": 5.586764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01794562302529812, "kl": 0.007855897187255323, "learning_rate": 9.051863077191937e-07, "loss": 7.887271931394935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 417.1875, "completions/min_length": 355.0, "epoch": 5.588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9713076949119568, "kl": 0.009148315992206335, "learning_rate": 9.051111021423867e-07, "loss": 9.248033165931702e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 462.5625, "completions/min_length": 381.0, "epoch": 5.589705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9367458820343018, "kl": 0.00808526948094368, "learning_rate": 9.050358698778854e-07, "loss": 8.088842150755227e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 482.1875, "completions/min_length": 399.0, "epoch": 5.591176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0144068002700806, "kl": 0.007247768924571574, "learning_rate": 9.04960610930646e-07, "loss": 7.281266152858734e-05, "reward": 0.9011499881744385, "reward_std": 0.1853467971086502, "rewards/DrugCombAccuracyCOTORM/mean": 0.887374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.3098659813404083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.26299554109573364, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 472.5, "completions/min_length": 408.0, "epoch": 5.5926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.1482068300247192, "kl": 0.009802451822906733, "learning_rate": 9.048853253056265e-07, "loss": 9.956210851669312e-05, "reward": 0.4312500059604645, "reward_std": 0.2069118171930313, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 450.6875, "completions/min_length": 371.0, "epoch": 5.594117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1468446254730225, "kl": 0.010335671016946435, "learning_rate": 9.048100130077863e-07, "loss": 0.0001026770769385621, "reward": 0.889033317565918, "reward_std": 0.17064107954502106, "rewards/DrugCombAccuracyCOTORM/mean": 0.8706666827201843, "rewards/DrugCombAccuracyCOTORM/std": 0.30003613233566284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 459.75, "completions/min_length": 371.0, "epoch": 5.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.007602964527904987, "kl": 0.006882593035697937, "learning_rate": 9.047346740420869e-07, "loss": 6.934139673830941e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 425.0, "completions/min_length": 381.0, "epoch": 5.597058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0473957061767578, "kl": 0.008221525931730866, "learning_rate": 9.046593084134915e-07, "loss": 8.306256495416164e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 437.1875, "completions/min_length": 398.0, "epoch": 5.598529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.019928811118006706, "kl": 0.012473972979933023, "learning_rate": 9.045839161269648e-07, "loss": 0.0001239102566614747, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 458.0, "completions/min_length": 427.0, "epoch": 5.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.01131298579275608, "kl": 0.00824235298205167, "learning_rate": 9.045084971874737e-07, "loss": 8.231564424932003e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 442.375, "completions/min_length": 397.0, "epoch": 5.601470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8776848912239075, "kl": 0.008531493018381298, "learning_rate": 9.044330515999864e-07, "loss": 8.577853441238403e-05, "reward": 0.3812500238418579, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 448.625, "completions/min_length": 379.0, "epoch": 5.602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1513234376907349, "kl": 0.013585623120889068, "learning_rate": 9.043575793694732e-07, "loss": 0.00013486022362485528, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 456.6875, "completions/min_length": 388.0, "epoch": 5.604411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0356916189193726, "kl": 0.00862260558642447, "learning_rate": 9.042820805009059e-07, "loss": 8.83326429175213e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 468.3125, "completions/min_length": 392.0, "epoch": 5.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.009866137057542801, "kl": 0.006425215397030115, "learning_rate": 9.042065549992583e-07, "loss": 6.45075851934962e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 490.9375, "completions/min_length": 408.0, "epoch": 5.607352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 4865.00146484375, "kl": 74.8603235706687, "learning_rate": 9.041310028695059e-07, "loss": 0.6138269305229187, "reward": 0.7827083468437195, "reward_std": 0.1950436234474182, "rewards/DrugCombAccuracyCOTORM/mean": 0.7604166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7437499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.48849257826805115, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 487.75, "completions/min_length": 391.0, "epoch": 5.608823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0694752931594849, "kl": 0.008776329457759857, "learning_rate": 9.040554241166256e-07, "loss": 8.755442104302347e-05, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 438.8125, "completions/min_length": 388.0, "epoch": 5.610294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9417126774787903, "kl": 0.009798071347177029, "learning_rate": 9.039798187455965e-07, "loss": 9.778141975402832e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 494.3125, "completions/min_length": 417.0, "epoch": 5.6117647058823525, "frac_reward_zero_std": 0.0, "grad_norm": 1.4337912797927856, "kl": 0.010005760588683188, "learning_rate": 9.039041867613994e-07, "loss": 0.00010023266077041626, "reward": 0.70333331823349, "reward_std": 0.2870243787765503, "rewards/DrugCombAccuracyCOTORM/mean": 0.6499999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.35590261220932007, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3651483952999115, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 474.375, "completions/min_length": 384.0, "epoch": 5.613235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8013965487480164, "kl": 0.008955822791904211, "learning_rate": 9.038285281690166e-07, "loss": 8.867409633239731e-05, "reward": 0.5976190567016602, "reward_std": 0.12768656015396118, "rewards/DrugCombAccuracyCOTORM/mean": 0.5595238208770752, "rewards/DrugCombAccuracyCOTORM/std": 0.4842180609703064, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.730296790599823, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 408.0625, "completions/min_length": 329.0, "epoch": 5.614705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.194390892982483, "kl": 0.0077081352937966585, "learning_rate": 9.037528429734322e-07, "loss": 7.703500159550458e-05, "reward": 0.9812500476837158, "reward_std": 0.025877445936203003, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 437.1875, "completions/min_length": 393.0, "epoch": 5.616176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.026319535449147224, "kl": 0.006768615450710058, "learning_rate": 9.036771311796323e-07, "loss": 6.810384365962818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 370.9375, "completions/min_length": 330.0, "epoch": 5.617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2845730781555176, "kl": 0.007039415766485035, "learning_rate": 9.036013927926047e-07, "loss": 7.070085848681629e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 466.0625, "completions/min_length": 430.0, "epoch": 5.6191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014054015278816223, "kl": 0.007146633928641677, "learning_rate": 9.035256278173386e-07, "loss": 7.203770655905828e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 466.0625, "completions/min_length": 376.0, "epoch": 5.620588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9532190561294556, "kl": 0.008163572056218982, "learning_rate": 9.034498362588255e-07, "loss": 8.132225048029795e-05, "reward": 0.6199374794960022, "reward_std": 0.07532911002635956, "rewards/DrugCombAccuracyCOTORM/mean": 0.5496614575386047, "rewards/DrugCombAccuracyCOTORM/std": 0.3615826666355133, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15176430344581604, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 445.125, "completions/min_length": 412.0, "epoch": 5.622058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013708910904824734, "kl": 0.008313692407682538, "learning_rate": 9.033740181220582e-07, "loss": 8.308772521559149e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 446.0625, "completions/min_length": 366.0, "epoch": 5.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2356181144714355, "kl": 0.010988121386617422, "learning_rate": 9.032981734120312e-07, "loss": 0.00010963813838316128, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 433.25, "completions/min_length": 368.0, "epoch": 5.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.010029492899775505, "kl": 0.005987414624541998, "learning_rate": 9.032223021337413e-07, "loss": 5.972401413600892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 426.3125, "completions/min_length": 391.0, "epoch": 5.626470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.01435153093189001, "kl": 0.008894544676877558, "learning_rate": 9.031464042921865e-07, "loss": 8.902048284653574e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 456.75, "completions/min_length": 398.0, "epoch": 5.627941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0976970195770264, "kl": 0.00715911143925041, "learning_rate": 9.030704798923667e-07, "loss": 7.162302790675312e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 509.6875, "completions/min_length": 449.0, "epoch": 5.629411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9081037640571594, "kl": 0.008747623185627162, "learning_rate": 9.02994528939284e-07, "loss": 8.706003427505493e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 451.125, "completions/min_length": 379.0, "epoch": 5.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0334126949310303, "kl": 0.008940834319218993, "learning_rate": 9.029185514379414e-07, "loss": 8.915229409467429e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 554.0625, "completions/min_length": 455.0, "epoch": 5.632352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7770002484321594, "kl": 0.008332419209182262, "learning_rate": 9.028425473933443e-07, "loss": 8.375197649002075e-05, "reward": 0.8220842480659485, "reward_std": 0.07235074788331985, "rewards/DrugCombAccuracyCOTORM/mean": 0.7776052951812744, "rewards/DrugCombAccuracyCOTORM/std": 0.2608148455619812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 471.0, "completions/min_length": 382.0, "epoch": 5.633823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.931697428226471, "kl": 0.012376537779346108, "learning_rate": 9.027665168104997e-07, "loss": 0.00012587780656758696, "reward": 0.6552083492279053, "reward_std": 0.19785243272781372, "rewards/DrugCombAccuracyCOTORM/mean": 0.6354166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.46435439586639404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 441.625, "completions/min_length": 400.0, "epoch": 5.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.012034809216856956, "kl": 0.006594956154003739, "learning_rate": 9.026904596944163e-07, "loss": 6.604597729165107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 435.8125, "completions/min_length": 388.0, "epoch": 5.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011965807527303696, "kl": 0.0076784235425293446, "learning_rate": 9.026143760501043e-07, "loss": 7.6632721174974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 437.75, "completions/min_length": 391.0, "epoch": 5.6382352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.7891809940338135, "kl": 0.008229867904447019, "learning_rate": 9.025382658825763e-07, "loss": 8.268654346466064e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 448.375, "completions/min_length": 385.0, "epoch": 5.639705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013425912708044052, "kl": 0.00816464051604271, "learning_rate": 9.024621291968459e-07, "loss": 8.168595377355814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 454.625, "completions/min_length": 391.0, "epoch": 5.641176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.013899853453040123, "kl": 0.007923000259324908, "learning_rate": 9.023859659979292e-07, "loss": 7.905371603555977e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 412.4375, "completions/min_length": 362.0, "epoch": 5.642647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.096403956413269, "kl": 0.007750215125270188, "learning_rate": 9.023097762908432e-07, "loss": 7.734447717666626e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 449.9375, "completions/min_length": 395.0, "epoch": 5.644117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01840970292687416, "kl": 0.009741476038470864, "learning_rate": 9.022335600806071e-07, "loss": 9.734161722008139e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 467.3125, "completions/min_length": 374.0, "epoch": 5.645588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8659326434135437, "kl": 0.009186794748529792, "learning_rate": 9.021573173722421e-07, "loss": 9.187858086079359e-05, "reward": 0.746874988079071, "reward_std": 0.18049411475658417, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 489.75, "completions/min_length": 405.0, "epoch": 5.647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2053886651992798, "kl": 0.01049591670744121, "learning_rate": 9.020810481707708e-07, "loss": 0.00010571836901362985, "reward": 0.7505208253860474, "reward_std": 0.12945318222045898, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3859512209892273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 465.125, "completions/min_length": 383.0, "epoch": 5.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013600361533463001, "kl": 0.008108565700240433, "learning_rate": 9.020047524812175e-07, "loss": 8.159471326507628e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 414.8125, "completions/min_length": 370.0, "epoch": 5.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.09673669189214706, "kl": 0.011246769223362207, "learning_rate": 9.019284303086085e-07, "loss": 0.00011505679140100256, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 476.625, "completions/min_length": 379.0, "epoch": 5.651470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.018148640170693398, "kl": 0.00834526470862329, "learning_rate": 9.018520816579717e-07, "loss": 8.365686517208815e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 476.8125, "completions/min_length": 423.0, "epoch": 5.652941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9526585340499878, "kl": 0.012283680494874716, "learning_rate": 9.017757065343368e-07, "loss": 0.00012114261335227638, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 406.0625, "completions/min_length": 378.0, "epoch": 5.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011215345934033394, "kl": 0.006054154830053449, "learning_rate": 9.016993049427351e-07, "loss": 6.0658680013148114e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 433.25, "completions/min_length": 364.0, "epoch": 5.655882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01921057514846325, "kl": 0.008078590617515147, "learning_rate": 9.016228768881998e-07, "loss": 8.109984628390521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 426.1875, "completions/min_length": 378.0, "epoch": 5.6573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0854573249816895, "kl": 0.008824406657367945, "learning_rate": 9.015464223757655e-07, "loss": 8.660554885864258e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 459.5625, "completions/min_length": 388.0, "epoch": 5.658823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0457097291946411, "kl": 0.008861362468451262, "learning_rate": 9.014699414104692e-07, "loss": 8.878024527803063e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 395.5625, "completions/min_length": 373.0, "epoch": 5.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.034801214933395386, "kl": 0.009526580572128296, "learning_rate": 9.013934339973493e-07, "loss": 9.54892166191712e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 505.625, "completions/min_length": 405.0, "epoch": 5.661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.7357795238494873, "kl": 0.010409384965896606, "learning_rate": 9.013169001414456e-07, "loss": 0.00010308250784873962, "reward": 0.8665833473205566, "reward_std": 0.20024511218070984, "rewards/DrugCombAccuracyCOTORM/mean": 0.8436458110809326, "rewards/DrugCombAccuracyCOTORM/std": 0.2396712452173233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.13608276844024658, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 459.4375, "completions/min_length": 407.0, "epoch": 5.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010067139752209187, "kl": 0.007768676267005503, "learning_rate": 9.012403398478001e-07, "loss": 7.69943289924413e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 447.625, "completions/min_length": 386.0, "epoch": 5.6647058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.0520260334014893, "kl": 0.0077270191395655274, "learning_rate": 9.011637531214565e-07, "loss": 7.686018943786621e-05, "reward": 0.9052083492279053, "reward_std": 0.10225021839141846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 436.6875, "completions/min_length": 395.0, "epoch": 5.666176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.027245480567216873, "kl": 0.00988927495200187, "learning_rate": 9.010871399674603e-07, "loss": 9.892525849863887e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 451.375, "completions/min_length": 370.0, "epoch": 5.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04863571375608444, "kl": 0.00967662944458425, "learning_rate": 9.01010500390858e-07, "loss": 9.664571553003043e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 455.0, "completions/min_length": 404.0, "epoch": 5.669117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.011113815009593964, "kl": 0.006713266251608729, "learning_rate": 9.009338343966988e-07, "loss": 6.714803748764098e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 438.5, "completions/min_length": 367.0, "epoch": 5.670588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9649686217308044, "kl": 0.008305932860821486, "learning_rate": 9.008571419900333e-07, "loss": 8.345519017893821e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 449.875, "completions/min_length": 367.0, "epoch": 5.672058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012029509991407394, "kl": 0.0070986319333314896, "learning_rate": 9.007804231759138e-07, "loss": 7.040333730401471e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 473.625, "completions/min_length": 408.0, "epoch": 5.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749037027359009, "kl": 0.00711013178806752, "learning_rate": 9.007036779593942e-07, "loss": 7.092860323609784e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 463.875, "completions/min_length": 398.0, "epoch": 5.675, "frac_reward_zero_std": 0.5, "grad_norm": 0.948980450630188, "kl": 0.009660754702053964, "learning_rate": 9.006269063455302e-07, "loss": 9.725231939228252e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 432.5625, "completions/min_length": 384.0, "epoch": 5.676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020263072103261948, "kl": 0.009683829848654568, "learning_rate": 9.005501083393797e-07, "loss": 9.697576751932502e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 439.0625, "completions/min_length": 382.0, "epoch": 5.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010316500440239906, "kl": 0.006854833569377661, "learning_rate": 9.004732839460016e-07, "loss": 6.86665007378906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 432.875, "completions/min_length": 370.0, "epoch": 5.679411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01417613960802555, "kl": 0.009412299608811736, "learning_rate": 9.003964331704572e-07, "loss": 9.378995309816673e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 433.3125, "completions/min_length": 318.0, "epoch": 5.680882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.941504955291748, "kl": 0.011943977326154709, "learning_rate": 9.00319556017809e-07, "loss": 0.00012285853154025972, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 444.875, "completions/min_length": 370.0, "epoch": 5.682352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.03140152618288994, "kl": 0.0074419863522052765, "learning_rate": 9.002426524931213e-07, "loss": 7.48782986192964e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 481.1875, "completions/min_length": 437.0, "epoch": 5.6838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.101193904876709, "kl": 0.01010475936345756, "learning_rate": 9.001657226014608e-07, "loss": 0.00010176931391470134, "reward": 0.598437488079071, "reward_std": 0.0044194175861775875, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 447.8125, "completions/min_length": 379.0, "epoch": 5.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012193206697702408, "kl": 0.009463352733291686, "learning_rate": 9.000887663478952e-07, "loss": 9.4265938969329e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 515.125, "completions/min_length": 445.0, "epoch": 5.686764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2512693405151367, "kl": 0.014804320875555277, "learning_rate": 9.000117837374938e-07, "loss": 0.0001510828733444214, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 429.375, "completions/min_length": 389.0, "epoch": 5.688235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4603098630905151, "kl": 0.010164022445678711, "learning_rate": 8.999347747753287e-07, "loss": 0.0001017451286315918, "reward": 0.6000000238418579, "reward_std": 0.4742809236049652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 603.0, "completions/min_length": 502.0, "epoch": 5.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7831278443336487, "kl": 0.007508685579523444, "learning_rate": 8.998577394664728e-07, "loss": 7.445162191288546e-05, "reward": 0.5739583373069763, "reward_std": 0.04573853686451912, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.6803287863731384, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 415.0625, "completions/min_length": 334.0, "epoch": 5.6911764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.3076239824295044, "kl": 0.010934429243206978, "learning_rate": 8.997806778160009e-07, "loss": 0.00011008493311237544, "reward": 0.5839166641235352, "reward_std": 0.17849332094192505, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666567325592, "rewards/DrugCombCoverageCOTORM/std": 0.9867173433303833, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 410.6875, "completions/min_length": 378.0, "epoch": 5.692647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02288675308227539, "kl": 0.009682251140475273, "learning_rate": 8.997035898289895e-07, "loss": 9.6741940069478e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 454.0625, "completions/min_length": 385.0, "epoch": 5.694117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014154668897390366, "kl": 0.008575390907935798, "learning_rate": 8.996264755105173e-07, "loss": 8.559658454032615e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 403.625, "completions/min_length": 360.0, "epoch": 5.695588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013380042277276516, "kl": 0.008132948656566441, "learning_rate": 8.995493348656641e-07, "loss": 8.136142423609272e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 447.625, "completions/min_length": 400.0, "epoch": 5.697058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.028007732704281807, "kl": 0.008536932524293661, "learning_rate": 8.994721678995119e-07, "loss": 8.56611950439401e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 497.5, "completions/min_length": 361.0, "epoch": 5.698529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.439483880996704, "kl": 0.00946727057453245, "learning_rate": 8.993949746171443e-07, "loss": 9.360536932945251e-05, "reward": 0.31852084398269653, "reward_std": 0.20311817526817322, "rewards/DrugCombAccuracyCOTORM/mean": 0.19632811844348907, "rewards/DrugCombAccuracyCOTORM/std": 0.33377641439437866, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5618394613265991, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 426.6875, "completions/min_length": 344.0, "epoch": 5.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.012085063382983208, "kl": 0.008342407178133726, "learning_rate": 8.993177550236463e-07, "loss": 8.336783503182232e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 511.5625, "completions/min_length": 418.0, "epoch": 5.701470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5109829902648926, "kl": 0.009223720524460077, "learning_rate": 8.992405091241054e-07, "loss": 9.204074740409851e-05, "reward": 0.7434325814247131, "reward_std": 0.32635653018951416, "rewards/DrugCombAccuracyCOTORM/mean": 0.716269850730896, "rewards/DrugCombAccuracyCOTORM/std": 0.39034390449523926, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7041666507720947, "rewards/DrugCombCoverageCOTORM/std": 0.4202072024345398, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 476.3125, "completions/min_length": 429.0, "epoch": 5.702941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9550427198410034, "kl": 0.009734260034747422, "learning_rate": 8.9916323692361e-07, "loss": 9.697159111965448e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 474.875, "completions/min_length": 392.0, "epoch": 5.704411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4127106666564941, "kl": 0.00797134346794337, "learning_rate": 8.990859384272506e-07, "loss": 8.036196231842041e-05, "reward": 0.612500011920929, "reward_std": 0.4725835919380188, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 476.9375, "completions/min_length": 422.0, "epoch": 5.705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6701743602752686, "kl": 0.0050568426959216595, "learning_rate": 8.990086136401198e-07, "loss": 5.017220973968506e-05, "reward": 0.5, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 460.9375, "completions/min_length": 415.0, "epoch": 5.70735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.8129361867904663, "kl": 0.014235659502446651, "learning_rate": 8.989312625673111e-07, "loss": 0.0001427978277206421, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 451.5, "completions/min_length": 386.0, "epoch": 5.708823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013705256395041943, "kl": 0.006348049733787775, "learning_rate": 8.988538852139204e-07, "loss": 6.346988084260374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 451.25, "completions/min_length": 411.0, "epoch": 5.7102941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.9049094915390015, "kl": 0.010366416187025607, "learning_rate": 8.987764815850451e-07, "loss": 0.00010319799184799194, "reward": 0.7095625400543213, "reward_std": 0.18187545239925385, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.971875011920929, "rewards/DrugCombCoverageCOTORM/std": 0.07739239931106567, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 445.4375, "completions/min_length": 417.0, "epoch": 5.711764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009626105427742004, "kl": 0.0074139394564554095, "learning_rate": 8.986990516857844e-07, "loss": 7.401729089906439e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 425.0, "completions/min_length": 360.0, "epoch": 5.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012422804720699787, "kl": 0.0066967615857720375, "learning_rate": 8.986215955212392e-07, "loss": 6.660245708189905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 470.9375, "completions/min_length": 399.0, "epoch": 5.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010361379012465477, "kl": 0.006290343124419451, "learning_rate": 8.985441130965121e-07, "loss": 6.311391916824505e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 491.5625, "completions/min_length": 393.0, "epoch": 5.716176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9711571931838989, "kl": 0.010695021832361817, "learning_rate": 8.984666044167073e-07, "loss": 0.00010652751370798796, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 435.5, "completions/min_length": 374.0, "epoch": 5.7176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017463387921452522, "kl": 0.008459048462100327, "learning_rate": 8.983890694869312e-07, "loss": 8.468278974760324e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 454.1875, "completions/min_length": 384.0, "epoch": 5.719117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019417844712734222, "kl": 0.008068848284892738, "learning_rate": 8.983115083122912e-07, "loss": 8.077597158262506e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 448.0, "completions/min_length": 390.0, "epoch": 5.720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019458001479506493, "kl": 0.007952274521812797, "learning_rate": 8.982339208978971e-07, "loss": 7.934451423352584e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 490.375, "completions/min_length": 406.0, "epoch": 5.722058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.008892957121133804, "kl": 0.006684628780931234, "learning_rate": 8.9815630724886e-07, "loss": 6.612946890527382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 440.875, "completions/min_length": 405.0, "epoch": 5.723529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01271094661206007, "kl": 0.010365223279222846, "learning_rate": 8.980786673702931e-07, "loss": 0.0001046027391566895, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 506.5, "completions/min_length": 421.0, "epoch": 5.725, "frac_reward_zero_std": 0.5, "grad_norm": 1.0773091316223145, "kl": 0.015235131955705583, "learning_rate": 8.980010012673109e-07, "loss": 0.00014984933659434319, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 437.5625, "completions/min_length": 390.0, "epoch": 5.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.040539875626564026, "kl": 0.009767443058080971, "learning_rate": 8.979233089450301e-07, "loss": 9.747041622176766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 427.5625, "completions/min_length": 382.0, "epoch": 5.727941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01190402265638113, "kl": 0.006261237082071602, "learning_rate": 8.978455904085687e-07, "loss": 6.237647903617471e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.3125, "completions/min_length": 389.0, "epoch": 5.729411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9961113333702087, "kl": 0.011176290339790285, "learning_rate": 8.977678456630465e-07, "loss": 0.00011173455277457833, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 475.625, "completions/min_length": 412.0, "epoch": 5.730882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.1807759553194046, "kl": 0.01577244489453733, "learning_rate": 8.976900747135853e-07, "loss": 0.00015954113041516393, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 466.9375, "completions/min_length": 372.0, "epoch": 5.732352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8358351588249207, "kl": 0.0077201531967148185, "learning_rate": 8.976122775653086e-07, "loss": 7.800757884979248e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 426.9375, "completions/min_length": 355.0, "epoch": 5.733823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014449354261159897, "kl": 0.008036367129534483, "learning_rate": 8.975344542233409e-07, "loss": 8.040086686378345e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 462.0, "completions/min_length": 385.0, "epoch": 5.735294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8711913824081421, "kl": 0.008736354415304959, "learning_rate": 8.974566046928099e-07, "loss": 8.597224950790405e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 483.125, "completions/min_length": 433.0, "epoch": 5.7367647058823525, "frac_reward_zero_std": 0.0, "grad_norm": 1.2520204782485962, "kl": 0.009127099765464664, "learning_rate": 8.973787289788432e-07, "loss": 9.12398099899292e-05, "reward": 0.8312499523162842, "reward_std": 0.36911603808403015, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 417.5, "completions/min_length": 345.0, "epoch": 5.738235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01126767136156559, "kl": 0.007448661723174155, "learning_rate": 8.973008270865716e-07, "loss": 7.447629468515515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 422.5, "completions/min_length": 354.0, "epoch": 5.739705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.011939473450183868, "kl": 0.0064925727201625705, "learning_rate": 8.97222899021127e-07, "loss": 6.486146594397724e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 437.0, "completions/min_length": 370.0, "epoch": 5.741176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.03162678703665733, "kl": 0.008292950689792633, "learning_rate": 8.971449447876431e-07, "loss": 8.354034798685461e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 466.4375, "completions/min_length": 393.0, "epoch": 5.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1481125354766846, "kl": 0.008963808417320251, "learning_rate": 8.970669643912551e-07, "loss": 8.97639911272563e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 474.375, "completions/min_length": 412.0, "epoch": 5.7441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.344271183013916, "kl": 0.0075738641899079084, "learning_rate": 8.969889578371003e-07, "loss": 7.568299770355225e-05, "reward": 0.971875011920929, "reward_std": 0.07954951375722885, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 448.25, "completions/min_length": 404.0, "epoch": 5.745588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9470993280410767, "kl": 0.00867112074047327, "learning_rate": 8.96910925130318e-07, "loss": 8.659809827804565e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 489.875, "completions/min_length": 442.0, "epoch": 5.747058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012952277436852455, "kl": 0.006960263010114431, "learning_rate": 8.968328662760481e-07, "loss": 6.975811265874654e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 417.5, "completions/min_length": 383.0, "epoch": 5.748529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.573554277420044, "kl": 0.009801695123314857, "learning_rate": 8.967547812794333e-07, "loss": 9.788572788238525e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 469.8125, "completions/min_length": 412.0, "epoch": 5.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.8907750248908997, "kl": 0.007556588621810079, "learning_rate": 8.966766701456176e-07, "loss": 7.600721437484026e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 504.3125, "completions/min_length": 438.0, "epoch": 5.751470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.2422692775726318, "kl": 0.0075349732069298625, "learning_rate": 8.965985328797466e-07, "loss": 7.531419396400452e-05, "reward": 0.7539889216423035, "reward_std": 0.2575134336948395, "rewards/DrugCombAccuracyCOTORM/mean": 0.722000002861023, "rewards/DrugCombAccuracyCOTORM/std": 0.34461650252342224, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7638888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.26566168665885925, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 455.25, "completions/min_length": 376.0, "epoch": 5.752941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9200536608695984, "kl": 0.0073779725935310125, "learning_rate": 8.96520369486968e-07, "loss": 7.372675463557243e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 461.0, "completions/min_length": 396.0, "epoch": 5.754411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017148252576589584, "kl": 0.007327473373152316, "learning_rate": 8.96442179972431e-07, "loss": 7.348221697611734e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 446.5, "completions/min_length": 381.0, "epoch": 5.7558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.306858777999878, "kl": 0.007793888100422919, "learning_rate": 8.963639643412864e-07, "loss": 7.752329111099243e-05, "reward": 0.9177083373069763, "reward_std": 0.17764092981815338, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 435.25, "completions/min_length": 394.0, "epoch": 5.757352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6621915102005005, "kl": 0.008997887605801225, "learning_rate": 8.962857225986869e-07, "loss": 9.060278534889221e-05, "reward": 0.5874999761581421, "reward_std": 0.44393861293792725, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 477.9375, "completions/min_length": 399.0, "epoch": 5.758823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1859394311904907, "kl": 0.008653088822029531, "learning_rate": 8.962074547497868e-07, "loss": 8.651510142954066e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 463.0, "completions/min_length": 316.0, "epoch": 5.760294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.03236732631921768, "kl": 0.008110423223115504, "learning_rate": 8.961291607997423e-07, "loss": 8.130872447509319e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 433.625, "completions/min_length": 336.0, "epoch": 5.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01377071626484394, "kl": 0.0054779197089374065, "learning_rate": 8.960508407537113e-07, "loss": 5.4816322517581284e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.4375, "completions/min_length": 428.0, "epoch": 5.7632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9550714492797852, "kl": 0.008327411604113877, "learning_rate": 8.959724946168531e-07, "loss": 8.251453982666135e-05, "reward": 0.9104166626930237, "reward_std": 0.15759539604187012, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 544.5, "completions/min_length": 476.0, "epoch": 5.764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.098476529121399, "kl": 0.007978744804859161, "learning_rate": 8.95894122394329e-07, "loss": 7.978081703186035e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 407.25, "completions/min_length": 371.0, "epoch": 5.766176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01158130168914795, "kl": 0.007993643288500607, "learning_rate": 8.95815724091302e-07, "loss": 7.986311538843438e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 451.0, "completions/min_length": 360.0, "epoch": 5.767647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.020645612850785255, "kl": 0.008904236485250294, "learning_rate": 8.957372997129369e-07, "loss": 8.895121573004872e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 451.9375, "completions/min_length": 317.0, "epoch": 5.769117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0717732906341553, "kl": 0.007819716585800052, "learning_rate": 8.956588492643997e-07, "loss": 7.794932025717571e-05, "reward": 0.8858500123023987, "reward_std": 0.15754196047782898, "rewards/DrugCombAccuracyCOTORM/mean": 0.8619999885559082, "rewards/DrugCombAccuracyCOTORM/std": 0.29669108986854553, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.08062257617712021, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 426.0, "completions/min_length": 368.0, "epoch": 5.770588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0081385374069214, "kl": 0.010511380503885448, "learning_rate": 8.95580372750859e-07, "loss": 0.00010802596807479858, "reward": 0.5225722193717957, "reward_std": 0.04461071267724037, "rewards/DrugCombAccuracyCOTORM/mean": 0.5082499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.5088768601417542, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.159722238779068, "rewards/DrugCombCoverageCOTORM/std": 0.9575077295303345, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 477.6875, "completions/min_length": 441.0, "epoch": 5.772058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.890708863735199, "kl": 0.007193810190074146, "learning_rate": 8.955018701774846e-07, "loss": 7.217377424240112e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.125, "completions/min_length": 399.0, "epoch": 5.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015542468056082726, "kl": 0.00979862967506051, "learning_rate": 8.954233415494476e-07, "loss": 9.789859177544713e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 507.75, "completions/min_length": 440.0, "epoch": 5.775, "frac_reward_zero_std": 0.5, "grad_norm": 1.1971951723098755, "kl": 0.02481564332265407, "learning_rate": 8.953447868719217e-07, "loss": 0.000243358314037323, "reward": 0.7302083373069763, "reward_std": 0.16127823293209076, "rewards/DrugCombAccuracyCOTORM/mean": 0.6770833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4323439598083496, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2083333432674408, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 441.3125, "completions/min_length": 399.0, "epoch": 5.776470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0674755573272705, "kl": 0.010213293950073421, "learning_rate": 8.952662061500817e-07, "loss": 0.00010309359640814364, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 432.1875, "completions/min_length": 349.0, "epoch": 5.777941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.6280030012130737, "kl": 0.010434981319122016, "learning_rate": 8.951875993891043e-07, "loss": 0.00010599568486213684, "reward": 0.3375000059604645, "reward_std": 0.4115486741065979, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 523.0625, "completions/min_length": 410.0, "epoch": 5.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8612141609191895, "kl": 0.008625412941910326, "learning_rate": 8.951089665941679e-07, "loss": 8.665025234222412e-05, "reward": 0.8192914724349976, "reward_std": 0.16639384627342224, "rewards/DrugCombAccuracyCOTORM/mean": 0.7785414457321167, "rewards/DrugCombAccuracyCOTORM/std": 0.3606595993041992, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9645833373069763, "rewards/DrugCombCoverageCOTORM/std": 0.0764671340584755, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 401.75, "completions/min_length": 356.0, "epoch": 5.780882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010411053895950317, "kl": 0.006983546540141106, "learning_rate": 8.950303077704527e-07, "loss": 6.997660966590047e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 409.1875, "completions/min_length": 367.0, "epoch": 5.7823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.145817756652832, "kl": 0.0068685016594827175, "learning_rate": 8.949516229231406e-07, "loss": 6.916809070389718e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 486.5, "completions/min_length": 417.0, "epoch": 5.783823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.3221030235290527, "kl": 0.006937092752195895, "learning_rate": 8.94872912057415e-07, "loss": 6.92903995513916e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 443.5, "completions/min_length": 380.0, "epoch": 5.785294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01824449561536312, "kl": 0.008555634180083871, "learning_rate": 8.947941751784613e-07, "loss": 8.510761836078018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 417.6875, "completions/min_length": 369.0, "epoch": 5.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009832928888499737, "kl": 0.0067979536252096295, "learning_rate": 8.947154122914665e-07, "loss": 6.761238910257816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 504.1875, "completions/min_length": 393.0, "epoch": 5.788235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0757770538330078, "kl": 0.009231204399839044, "learning_rate": 8.946366234016191e-07, "loss": 9.250640869140625e-05, "reward": 0.7012500166893005, "reward_std": 0.11224884539842606, "rewards/DrugCombAccuracyCOTORM/mean": 0.6447916626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.4139564633369446, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24247948825359344, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 488.5625, "completions/min_length": 453.0, "epoch": 5.7897058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0430135615170002, "kl": 0.009727258002385497, "learning_rate": 8.945578085141096e-07, "loss": 9.792651690077037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 475.1875, "completions/min_length": 385.0, "epoch": 5.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1325989961624146, "kl": 0.0081101591931656, "learning_rate": 8.944789676341305e-07, "loss": 8.080154657363892e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 422.125, "completions/min_length": 373.0, "epoch": 5.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0656564235687256, "kl": 0.011630128836259246, "learning_rate": 8.944001007668752e-07, "loss": 0.00011633583198999986, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 446.6875, "completions/min_length": 401.0, "epoch": 5.794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.845698356628418, "kl": 0.009818589547649026, "learning_rate": 8.94321207917539e-07, "loss": 9.694910841062665e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 563.0625, "completions/min_length": 499.0, "epoch": 5.795588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6096131801605225, "kl": 0.01292865420691669, "learning_rate": 8.942422890913199e-07, "loss": 0.00012953579425811768, "reward": 0.8560925722122192, "reward_std": 0.2597299814224243, "rewards/DrugCombAccuracyCOTORM/mean": 0.8279281854629517, "rewards/DrugCombAccuracyCOTORM/std": 0.31417855620384216, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.1031898707151413, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 457.75, "completions/min_length": 384.0, "epoch": 5.797058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6881889700889587, "kl": 0.010697174002416432, "learning_rate": 8.941633442934164e-07, "loss": 0.00010612607002258301, "reward": 0.669950008392334, "reward_std": 0.1333603411912918, "rewards/DrugCombAccuracyCOTORM/mean": 0.6202499866485596, "rewards/DrugCombAccuracyCOTORM/std": 0.444717675447464, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.737500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.30740854144096375, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 425.9375, "completions/min_length": 385.0, "epoch": 5.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.05071178078651428, "kl": 0.011107181431725621, "learning_rate": 8.940843735290294e-07, "loss": 0.0001094302861019969, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 496.9375, "completions/min_length": 431.0, "epoch": 5.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.1327651739120483, "kl": 0.01016671396791935, "learning_rate": 8.940053768033608e-07, "loss": 0.00010133348405361176, "reward": 0.6499999761581421, "reward_std": 0.22038927674293518, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 466.1875, "completions/min_length": 403.0, "epoch": 5.801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3792070150375366, "kl": 0.008873976184986532, "learning_rate": 8.939263541216155e-07, "loss": 8.875435014488176e-05, "reward": 0.9552083015441895, "reward_std": 0.08368229866027832, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 477.75, "completions/min_length": 387.0, "epoch": 5.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9133856296539307, "kl": 0.009503740817308426, "learning_rate": 8.938473054889988e-07, "loss": 9.461888112127781e-05, "reward": 0.7100833058357239, "reward_std": 0.1885562241077423, "rewards/DrugCombAccuracyCOTORM/mean": 0.6662499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.45040538907051086, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 412.0, "completions/min_length": 364.0, "epoch": 5.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.023326486349105835, "kl": 0.009714751737192273, "learning_rate": 8.937682309107182e-07, "loss": 9.664669778430834e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 455.1875, "completions/min_length": 407.0, "epoch": 5.805882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7594091892242432, "kl": 0.010795733891427517, "learning_rate": 8.936891303919831e-07, "loss": 0.0001079067587852478, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 489.25, "completions/min_length": 433.0, "epoch": 5.807352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0217523574829102, "kl": 0.008130051312036812, "learning_rate": 8.936100039380044e-07, "loss": 8.158385753631592e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/mean_length": 511.75, "completions/min_length": 380.0, "epoch": 5.8088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.7627742886543274, "kl": 0.0085755210602656, "learning_rate": 8.935308515539947e-07, "loss": 8.553630323149264e-05, "reward": 0.8232197165489197, "reward_std": 0.024212075397372246, "rewards/DrugCombAccuracyCOTORM/mean": 0.7878788113594055, "rewards/DrugCombAccuracyCOTORM/std": 0.222405806183815, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9291666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.18929694592952728, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 480.3125, "completions/min_length": 420.0, "epoch": 5.810294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.2400568723678589, "kl": 0.009952478110790253, "learning_rate": 8.934516732451684e-07, "loss": 9.908752690535039e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 485.0, "completions/min_length": 431.0, "epoch": 5.811764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7959290146827698, "kl": 0.00807130488101393, "learning_rate": 8.933724690167416e-07, "loss": 8.059685933403671e-05, "reward": 0.6755833029747009, "reward_std": 0.20776480436325073, "rewards/DrugCombAccuracyCOTORM/mean": 0.6387500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4844498634338379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6935549974441528, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 472.0625, "completions/min_length": 433.0, "epoch": 5.813235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.029956111684441566, "kl": 0.009112150990404189, "learning_rate": 8.932932388739321e-07, "loss": 9.044677426572889e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 493.0625, "completions/min_length": 414.0, "epoch": 5.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0115299224853516, "kl": 0.009077959461137652, "learning_rate": 8.932139828219592e-07, "loss": 9.043999307323247e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 442.5625, "completions/min_length": 406.0, "epoch": 5.8161764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 1.479251742362976, "kl": 0.016850409330800176, "learning_rate": 8.931347008660442e-07, "loss": 0.0001665055751800537, "reward": 0.5874999761581421, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 456.8125, "completions/min_length": 407.0, "epoch": 5.817647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0243293046951294, "kl": 0.00727481278590858, "learning_rate": 8.930553930114102e-07, "loss": 7.276982069015503e-05, "reward": 0.8302083611488342, "reward_std": 0.07013839483261108, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.24720662832260132, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 429.375, "completions/min_length": 348.0, "epoch": 5.819117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01126745343208313, "kl": 0.006254049367271364, "learning_rate": 8.929760592632815e-07, "loss": 6.306002615019679e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 429.0, "completions/min_length": 372.0, "epoch": 5.820588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.018063193187117577, "kl": 0.007685804506763816, "learning_rate": 8.928966996268845e-07, "loss": 7.608789019286633e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 432.5, "completions/min_length": 401.0, "epoch": 5.822058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1601051092147827, "kl": 0.008696650853380561, "learning_rate": 8.928173141074472e-07, "loss": 8.743256330490112e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 452.6875, "completions/min_length": 400.0, "epoch": 5.823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9508181214332581, "kl": 0.007549766916781664, "learning_rate": 8.927379027101993e-07, "loss": 7.542728417320177e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 480.5, "completions/min_length": 409.0, "epoch": 5.825, "frac_reward_zero_std": 0.5, "grad_norm": 1.0386112928390503, "kl": 0.009719344321638346, "learning_rate": 8.926584654403724e-07, "loss": 9.846687316894531e-05, "reward": 0.8999999761581421, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 430.6875, "completions/min_length": 382.0, "epoch": 5.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.023821834474802017, "kl": 0.008722164086066186, "learning_rate": 8.925790023031994e-07, "loss": 8.693229028722271e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 418.25, "completions/min_length": 354.0, "epoch": 5.827941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7760775685310364, "kl": 0.007638805313035846, "learning_rate": 8.924995133039153e-07, "loss": 7.73199790273793e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 401.9375, "completions/min_length": 352.0, "epoch": 5.829411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010549100115895271, "kl": 0.008268897421658039, "learning_rate": 8.924199984477565e-07, "loss": 8.225077181123197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 515.125, "completions/min_length": 414.0, "epoch": 5.830882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3487919569015503, "kl": 0.007929990533739328, "learning_rate": 8.923404577399613e-07, "loss": 8.052587509155273e-05, "reward": 0.18956251442432404, "reward_std": 0.15770193934440613, "rewards/DrugCombAccuracyCOTORM/mean": 0.09828124940395355, "rewards/DrugCombAccuracyCOTORM/std": 0.13306865096092224, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 436.5625, "completions/min_length": 369.0, "epoch": 5.83235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.373605489730835, "kl": 0.011654144036583602, "learning_rate": 8.922608911857696e-07, "loss": 0.0001155580539489165, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 439.4375, "completions/min_length": 349.0, "epoch": 5.833823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0836691856384277, "kl": 0.011212278390303254, "learning_rate": 8.921812987904231e-07, "loss": 0.00011418973735999316, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 421.5, "completions/min_length": 366.0, "epoch": 5.8352941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.014050360769033432, "kl": 0.007670436170883477, "learning_rate": 8.921016805591653e-07, "loss": 7.67470191931352e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 458.4375, "completions/min_length": 388.0, "epoch": 5.836764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9523805379867554, "kl": 0.010116957826539874, "learning_rate": 8.920220364972408e-07, "loss": 0.00010153844050364569, "reward": 0.23750001192092896, "reward_std": 0.1060660183429718, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 451.9375, "completions/min_length": 366.0, "epoch": 5.838235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3666995763778687, "kl": 0.010269640129990876, "learning_rate": 8.919423666098969e-07, "loss": 0.00010187923908233643, "reward": 0.5993332862854004, "reward_std": 0.18215742707252502, "rewards/DrugCombAccuracyCOTORM/mean": 0.5199999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4445222318172455, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 466.9375, "completions/min_length": 407.0, "epoch": 5.839705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1131691932678223, "kl": 0.008616948733106256, "learning_rate": 8.918626709023816e-07, "loss": 8.670240640640259e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 496.6875, "completions/min_length": 391.0, "epoch": 5.841176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.829674243927002, "kl": 0.009367546299472451, "learning_rate": 8.917829493799452e-07, "loss": 9.363889694213867e-05, "reward": 0.9399999976158142, "reward_std": 0.14020393788814545, "rewards/DrugCombAccuracyCOTORM/mean": 0.925000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.25166115164756775, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 505.0, "completions/min_length": 458.0, "epoch": 5.8426470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.3700380325317383, "kl": 0.006861408473923802, "learning_rate": 8.917032020478394e-07, "loss": 6.865710020065308e-05, "reward": 0.890458345413208, "reward_std": 0.22216397523880005, "rewards/DrugCombAccuracyCOTORM/mean": 0.87479168176651, "rewards/DrugCombAccuracyCOTORM/std": 0.2892204523086548, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 422.8125, "completions/min_length": 387.0, "epoch": 5.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.023736456409096718, "kl": 0.008780828327871859, "learning_rate": 8.916234289113182e-07, "loss": 8.789376443019137e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 447.3125, "completions/min_length": 372.0, "epoch": 5.845588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010991760529577732, "kl": 0.006648991722613573, "learning_rate": 8.915436299756365e-07, "loss": 6.64468971081078e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 472.0, "completions/min_length": 396.0, "epoch": 5.847058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8758513927459717, "kl": 0.006821111426688731, "learning_rate": 8.914638052460514e-07, "loss": 6.811261846451089e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 409.25, "completions/min_length": 375.0, "epoch": 5.848529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02281646989285946, "kl": 0.00998309359420091, "learning_rate": 8.913839547278215e-07, "loss": 9.899974975269288e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 443.5625, "completions/min_length": 383.0, "epoch": 5.85, "frac_reward_zero_std": 0.5, "grad_norm": 1.4543180465698242, "kl": 0.012988788541406393, "learning_rate": 8.913040784262069e-07, "loss": 0.00012885508476756513, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 424.875, "completions/min_length": 374.0, "epoch": 5.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3175357580184937, "kl": 0.008553464314900339, "learning_rate": 8.912241763464701e-07, "loss": 8.512059866916388e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 491.8125, "completions/min_length": 420.0, "epoch": 5.852941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5416117906570435, "kl": 0.013398749055340886, "learning_rate": 8.911442484938744e-07, "loss": 0.00013340264558792114, "reward": 0.19375000894069672, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 450.3125, "completions/min_length": 383.0, "epoch": 5.854411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013409977778792381, "kl": 0.01002432219684124, "learning_rate": 8.910642948736857e-07, "loss": 0.0001000304619083181, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 450.0625, "completions/min_length": 404.0, "epoch": 5.855882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.001199722290039, "kl": 0.01296484237536788, "learning_rate": 8.909843154911709e-07, "loss": 0.0001261681318283081, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 456.5, "completions/min_length": 363.0, "epoch": 5.857352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.5001379251480103, "kl": 0.011010744608938694, "learning_rate": 8.909043103515987e-07, "loss": 0.00010955333709716797, "reward": 0.5148333311080933, "reward_std": 0.25534749031066895, "rewards/DrugCombAccuracyCOTORM/mean": 0.4300000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4644136130809784, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3191423714160919, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 452.875, "completions/min_length": 395.0, "epoch": 5.858823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.018027853220701218, "kl": 0.009783873800188303, "learning_rate": 8.9082427946024e-07, "loss": 9.756452345754951e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.0, "completions/min_length": 393.0, "epoch": 5.860294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016821354627609253, "kl": 0.009226183872669935, "learning_rate": 8.907442228223667e-07, "loss": 9.289899026043713e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 454.0, "completions/min_length": 418.0, "epoch": 5.8617647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.2101706266403198, "kl": 0.009077977039851248, "learning_rate": 8.906641404432529e-07, "loss": 8.957970567280427e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 456.75, "completions/min_length": 410.0, "epoch": 5.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8705736994743347, "kl": 0.008246966288425028, "learning_rate": 8.905840323281741e-07, "loss": 8.222460746765137e-05, "reward": 0.7037500143051147, "reward_std": 0.12213702499866486, "rewards/DrugCombAccuracyCOTORM/mean": 0.6508928537368774, "rewards/DrugCombAccuracyCOTORM/std": 0.4099354147911072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8303571343421936, "rewards/DrugCombCoverageCOTORM/std": 0.21854236721992493, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 471.8125, "completions/min_length": 410.0, "epoch": 5.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.017766566947102547, "kl": 0.0083307686727494, "learning_rate": 8.905038984824076e-07, "loss": 8.391495066462085e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 440.625, "completions/min_length": 403.0, "epoch": 5.866176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.021816108375787735, "kl": 0.010279623558744788, "learning_rate": 8.904237389112328e-07, "loss": 0.00010365244088461623, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 470.625, "completions/min_length": 417.0, "epoch": 5.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.018489181995391846, "kl": 0.010627822251990438, "learning_rate": 8.903435536199298e-07, "loss": 0.00010540453513385728, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 439.6875, "completions/min_length": 381.0, "epoch": 5.8691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008366754278540611, "kl": 0.0069107327144593, "learning_rate": 8.902633426137815e-07, "loss": 6.85279446770437e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 475.5, "completions/min_length": 381.0, "epoch": 5.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9548349976539612, "kl": 0.009953002678230405, "learning_rate": 8.901831058980717e-07, "loss": 9.909272193908691e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 449.1875, "completions/min_length": 381.0, "epoch": 5.872058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016667675226926804, "kl": 0.007574264891445637, "learning_rate": 8.901028434780866e-07, "loss": 7.560924859717488e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 455.75, "completions/min_length": 399.0, "epoch": 5.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0113009214401245, "kl": 0.009766886592842638, "learning_rate": 8.900225553591132e-07, "loss": 9.739398956298828e-05, "reward": 0.30000001192092896, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 448.0625, "completions/min_length": 403.0, "epoch": 5.875, "frac_reward_zero_std": 0.5, "grad_norm": 1.0001741647720337, "kl": 0.009212569333612919, "learning_rate": 8.899422415464408e-07, "loss": 9.13664698600769e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 449.6875, "completions/min_length": 402.0, "epoch": 5.876470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9831201434135437, "kl": 0.007507595117203891, "learning_rate": 8.898619020453605e-07, "loss": 7.507350528612733e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 419.75, "completions/min_length": 355.0, "epoch": 5.877941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.7466825246810913, "kl": 0.012880297610536218, "learning_rate": 8.897815368611646e-07, "loss": 0.000128820538520813, "reward": 0.6499999761581421, "reward_std": 0.39218369126319885, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 450.3125, "completions/min_length": 419.0, "epoch": 5.879411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.030035430565476418, "kl": 0.007851371075958014, "learning_rate": 8.897011459991476e-07, "loss": 7.835284486645833e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 440.625, "completions/min_length": 381.0, "epoch": 5.8808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01309085264801979, "kl": 0.008177577401511371, "learning_rate": 8.896207294646052e-07, "loss": 8.086329762591049e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 465.0625, "completions/min_length": 407.0, "epoch": 5.882352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.245501160621643, "kl": 0.011718638241291046, "learning_rate": 8.895402872628351e-07, "loss": 0.00011676549911499023, "reward": 0.29375001788139343, "reward_std": 0.21286731958389282, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 436.9375, "completions/min_length": 403.0, "epoch": 5.883823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015341254882514477, "kl": 0.008752592140808702, "learning_rate": 8.894598193991368e-07, "loss": 8.784122474025935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 525.25, "completions/min_length": 420.0, "epoch": 5.885294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.529175877571106, "kl": 0.011652307352051139, "learning_rate": 8.893793258788112e-07, "loss": 0.00011602789163589478, "reward": 0.3968541622161865, "reward_std": 0.09875757992267609, "rewards/DrugCombAccuracyCOTORM/mean": 0.36716145277023315, "rewards/DrugCombAccuracyCOTORM/std": 0.39650586247444153, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.03125, "rewards/DrugCombCoverageCOTORM/std": 0.9510593414306641, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 465.75, "completions/min_length": 384.0, "epoch": 5.886764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020221970975399017, "kl": 0.009476715815253556, "learning_rate": 8.892988067071609e-07, "loss": 9.527010843157768e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 472.5, "completions/min_length": 395.0, "epoch": 5.8882352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9527864456176758, "kl": 0.013202352216467261, "learning_rate": 8.892182618894905e-07, "loss": 0.00013092142762616277, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 422.875, "completions/min_length": 363.0, "epoch": 5.889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022441096603870392, "kl": 0.007578200078569353, "learning_rate": 8.891376914311059e-07, "loss": 7.642493437742814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 470.5625, "completions/min_length": 420.0, "epoch": 5.891176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.15353262424469, "kl": 0.011697768233716488, "learning_rate": 8.890570953373151e-07, "loss": 0.00011786361574195325, "reward": 0.7749999761581421, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 428.5, "completions/min_length": 394.0, "epoch": 5.892647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02006358653306961, "kl": 0.008118596160784364, "learning_rate": 8.889764736134274e-07, "loss": 8.07643955340609e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 462.5, "completions/min_length": 402.0, "epoch": 5.894117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.1381371021270752, "kl": 0.010396165773272514, "learning_rate": 8.888958262647539e-07, "loss": 0.00010420382022857666, "reward": 0.629687488079071, "reward_std": 0.22892777621746063, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9464847445487976, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 488.25, "completions/min_length": 414.0, "epoch": 5.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.8340184688568115, "kl": 0.010406389134004712, "learning_rate": 8.888151532966077e-07, "loss": 0.0001043959564412944, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 396.125, "completions/min_length": 306.0, "epoch": 5.897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.00860673002898693, "kl": 0.006466034101322293, "learning_rate": 8.887344547143031e-07, "loss": 6.462883175117895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 461.375, "completions/min_length": 414.0, "epoch": 5.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9227510690689087, "kl": 0.010769624961540103, "learning_rate": 8.886537305231563e-07, "loss": 0.00010845810174942017, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 477.9375, "completions/min_length": 406.0, "epoch": 5.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.9793174266815186, "kl": 0.008760341792367399, "learning_rate": 8.885729807284854e-07, "loss": 8.704513311386108e-05, "reward": 0.803727388381958, "reward_std": 0.1621362864971161, "rewards/DrugCombAccuracyCOTORM/mean": 0.7741904854774475, "rewards/DrugCombAccuracyCOTORM/std": 0.34420472383499146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.2667968273162842, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 447.0625, "completions/min_length": 424.0, "epoch": 5.901470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.803182065486908, "kl": 0.010295877582393587, "learning_rate": 8.884922053356098e-07, "loss": 0.00010289251804351807, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 444.25, "completions/min_length": 371.0, "epoch": 5.902941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1578534841537476, "kl": 0.010038482956588268, "learning_rate": 8.88411404349851e-07, "loss": 9.956994472304359e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 441.0625, "completions/min_length": 392.0, "epoch": 5.904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9674602746963501, "kl": 0.010665084118954837, "learning_rate": 8.883305777765317e-07, "loss": 0.0001057991903508082, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 445.0, "completions/min_length": 371.0, "epoch": 5.905882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02048354223370552, "kl": 0.008654050645418465, "learning_rate": 8.882497256209767e-07, "loss": 8.630601951153949e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 470.5, "completions/min_length": 447.0, "epoch": 5.9073529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7663729786872864, "kl": 0.00974957866128534, "learning_rate": 8.881688478885124e-07, "loss": 9.769946336746216e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 433.375, "completions/min_length": 388.0, "epoch": 5.908823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019547319039702415, "kl": 0.009237566962838173, "learning_rate": 8.880879445844667e-07, "loss": 9.191812569042668e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 454.1875, "completions/min_length": 401.0, "epoch": 5.910294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020719794556498528, "kl": 0.011273771640844643, "learning_rate": 8.880070157141692e-07, "loss": 0.00011104038276243955, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 484.0, "completions/min_length": 431.0, "epoch": 5.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1340668201446533, "kl": 0.011153045343235135, "learning_rate": 8.879260612829516e-07, "loss": 0.00011070817708969116, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 469.3125, "completions/min_length": 384.0, "epoch": 5.913235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6352277994155884, "kl": 0.016517639625817537, "learning_rate": 8.878450812961467e-07, "loss": 0.00016447901725769043, "reward": 0.5839166641235352, "reward_std": 0.34765625, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9867174029350281, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 462.8125, "completions/min_length": 375.0, "epoch": 5.9147058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.0435155630111694, "kl": 0.005658556707203388, "learning_rate": 8.877640757590894e-07, "loss": 5.631148815155029e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 512.3125, "completions/min_length": 384.0, "epoch": 5.916176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.4812723398208618, "kl": 0.008944395231083035, "learning_rate": 8.876830446771161e-07, "loss": 8.913874626159668e-05, "reward": 0.760425865650177, "reward_std": 0.27436596155166626, "rewards/DrugCombAccuracyCOTORM/mean": 0.7187615036964417, "rewards/DrugCombAccuracyCOTORM/std": 0.34746450185775757, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2889423072338104, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 462.9375, "completions/min_length": 391.0, "epoch": 5.91764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01681622490286827, "kl": 0.01033012242987752, "learning_rate": 8.876019880555648e-07, "loss": 0.00010256141104036942, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 458.5, "completions/min_length": 398.0, "epoch": 5.919117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.09838828444480896, "kl": 0.017018123995512724, "learning_rate": 8.875209058997757e-07, "loss": 0.00017246109200641513, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 442.75, "completions/min_length": 353.0, "epoch": 5.920588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019061878323554993, "kl": 0.010609800228849053, "learning_rate": 8.874397982150897e-07, "loss": 0.00010727430344559252, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 497.375, "completions/min_length": 408.0, "epoch": 5.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.09298992156982422, "kl": 0.012232574983499944, "learning_rate": 8.873586650068505e-07, "loss": 0.00012192915892228484, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 488.5625, "completions/min_length": 410.0, "epoch": 5.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9433404207229614, "kl": 0.009019372751936316, "learning_rate": 8.872775062804026e-07, "loss": 9.045453043654561e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 487.1875, "completions/min_length": 432.0, "epoch": 5.925, "frac_reward_zero_std": 0.5, "grad_norm": 1.0145950317382812, "kl": 0.009889026870951056, "learning_rate": 8.871963220410927e-07, "loss": 9.825825691223145e-05, "reward": 0.9412500262260437, "reward_std": 0.06847400218248367, "rewards/DrugCombAccuracyCOTORM/mean": 0.949999988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.08944271504878998, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 462.8125, "completions/min_length": 392.0, "epoch": 5.926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0548473596572876, "kl": 0.010822159703820944, "learning_rate": 8.87115112294269e-07, "loss": 0.00010857160668820143, "reward": 0.6520833373069763, "reward_std": 0.21756908297538757, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.7978559136390686, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 452.5625, "completions/min_length": 409.0, "epoch": 5.927941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2749876976013184, "kl": 0.009716840693727136, "learning_rate": 8.870338770452814e-07, "loss": 9.752810001373291e-05, "reward": 0.643750011920929, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 488.4375, "completions/min_length": 370.0, "epoch": 5.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8305962085723877, "kl": 0.013337055454030633, "learning_rate": 8.869526162994814e-07, "loss": 0.00013349950313568115, "reward": 0.23495832085609436, "reward_std": 0.15467225015163422, "rewards/DrugCombAccuracyCOTORM/mean": 0.07625000178813934, "rewards/DrugCombAccuracyCOTORM/std": 0.2523720860481262, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5090613961219788, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 410.125, "completions/min_length": 337.0, "epoch": 5.930882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1733181476593018, "kl": 0.01010243664495647, "learning_rate": 8.868713300622224e-07, "loss": 0.00010213075438514352, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 477.5, "completions/min_length": 372.0, "epoch": 5.932352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8568191528320312, "kl": 0.00858704186975956, "learning_rate": 8.86790018338859e-07, "loss": 8.577853441238403e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 424.5, "completions/min_length": 369.0, "epoch": 5.9338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.05107639357447624, "kl": 0.011900975368916988, "learning_rate": 8.867086811347482e-07, "loss": 0.00011860518134199083, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 484.875, "completions/min_length": 452.0, "epoch": 5.935294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0272389650344849, "kl": 0.006955345976166427, "learning_rate": 8.866273184552482e-07, "loss": 6.92580797476694e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 429.125, "completions/min_length": 342.0, "epoch": 5.936764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.960993230342865, "kl": 0.00794536795001477, "learning_rate": 8.865459303057188e-07, "loss": 7.878243923187256e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 514.25, "completions/min_length": 465.0, "epoch": 5.938235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4021456241607666, "kl": 0.011095582041889429, "learning_rate": 8.864645166915217e-07, "loss": 0.00011048093438148499, "reward": 0.4479166865348816, "reward_std": 0.34905460476875305, "rewards/DrugCombAccuracyCOTORM/mean": 0.3333333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.4036867320537567, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.75, "completions/min_length": 329.0, "epoch": 5.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.17893154919147491, "kl": 0.013357974588871002, "learning_rate": 8.863830776180202e-07, "loss": 0.00013051333371549845, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 427.8125, "completions/min_length": 374.0, "epoch": 5.9411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.02373911254107952, "kl": 0.010115477256476879, "learning_rate": 8.863016130905794e-07, "loss": 0.00010112416202900931, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 470.5, "completions/min_length": 376.0, "epoch": 5.942647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01181465107947588, "kl": 0.007426823838613927, "learning_rate": 8.862201231145661e-07, "loss": 7.422514318022877e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 533.625, "completions/min_length": 442.0, "epoch": 5.944117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7842389941215515, "kl": 0.007355501176789403, "learning_rate": 8.861386076953483e-07, "loss": 7.401211769320071e-05, "reward": 0.9754166603088379, "reward_std": 0.04552818834781647, "rewards/DrugCombAccuracyCOTORM/mean": 0.971875011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.07739239931106567, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 469.25, "completions/min_length": 419.0, "epoch": 5.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0991450548171997, "kl": 0.010205986094661057, "learning_rate": 8.860570668382963e-07, "loss": 9.970460087060928e-05, "reward": 0.6074166297912598, "reward_std": 0.039867136627435684, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 450.125, "completions/min_length": 411.0, "epoch": 5.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2176334857940674, "kl": 0.00971739471424371, "learning_rate": 8.859755005487817e-07, "loss": 9.831786155700684e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 428.3125, "completions/min_length": 396.0, "epoch": 5.948529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.023035096004605293, "kl": 0.008320674067363143, "learning_rate": 8.85893908832178e-07, "loss": 8.201340824598446e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 422.125, "completions/min_length": 373.0, "epoch": 5.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.015229042619466782, "kl": 0.00881798635236919, "learning_rate": 8.858122916938599e-07, "loss": 8.831630839267746e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 440.0625, "completions/min_length": 396.0, "epoch": 5.951470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2193199396133423, "kl": 0.013020656653679907, "learning_rate": 8.857306491392047e-07, "loss": 0.0001316244015470147, "reward": 0.6783333420753479, "reward_std": 0.2194003313779831, "rewards/DrugCombAccuracyCOTORM/mean": 0.6708333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.45246240496635437, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.873477578163147, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 438.4375, "completions/min_length": 409.0, "epoch": 5.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01442397478967905, "kl": 0.009640756528824568, "learning_rate": 8.856489811735904e-07, "loss": 9.647153638070449e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.1875, "completions/min_length": 377.0, "epoch": 5.954411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0240521430969238, "kl": 0.007565574487671256, "learning_rate": 8.85567287802397e-07, "loss": 7.577240467071533e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 494.6875, "completions/min_length": 419.0, "epoch": 5.955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9227874875068665, "kl": 0.009777341852895916, "learning_rate": 8.854855690310066e-07, "loss": 9.986013174057007e-05, "reward": 0.624500036239624, "reward_std": 0.13624709844589233, "rewards/DrugCombAccuracyCOTORM/mean": 0.5879166722297668, "rewards/DrugCombAccuracyCOTORM/std": 0.4598822593688965, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7781745791435242, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 449.3125, "completions/min_length": 406.0, "epoch": 5.95735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.528954267501831, "kl": 0.008481990545988083, "learning_rate": 8.854038248648025e-07, "loss": 8.450448513031006e-05, "reward": 0.5337797403335571, "reward_std": 0.30383169651031494, "rewards/DrugCombAccuracyCOTORM/mean": 0.4419642686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.47966739535331726, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5207499861717224, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 438.3125, "completions/min_length": 364.0, "epoch": 5.958823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.15283203125, "kl": 0.011927897110581398, "learning_rate": 8.853220553091695e-07, "loss": 0.00011743605136871338, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 460.5, "completions/min_length": 404.0, "epoch": 5.9602941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.7832840085029602, "kl": 0.017889442620798945, "learning_rate": 8.852402603694949e-07, "loss": 0.00017352821305394173, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 478.125, "completions/min_length": 409.0, "epoch": 5.961764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0537478923797607, "kl": 0.009256675839424133, "learning_rate": 8.851584400511669e-07, "loss": 9.351968765258789e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 455.5, "completions/min_length": 398.0, "epoch": 5.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0811619758605957, "kl": 0.010590239893645048, "learning_rate": 8.850765943595753e-07, "loss": 0.00010574938642093912, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 476.3125, "completions/min_length": 367.0, "epoch": 5.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1863147020339966, "kl": 0.010558410547673702, "learning_rate": 8.849947233001124e-07, "loss": 0.00010653414210537449, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 425.0, "completions/min_length": 378.0, "epoch": 5.966176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0533053874969482, "kl": 0.012436016113497317, "learning_rate": 8.849128268781714e-07, "loss": 0.00012607872486114502, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 408.25, "completions/min_length": 372.0, "epoch": 5.9676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010471471585333347, "kl": 0.00684199552051723, "learning_rate": 8.848309050991474e-07, "loss": 6.832007056800649e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 481.5625, "completions/min_length": 407.0, "epoch": 5.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.050998568534851, "kl": 0.008766651153564453, "learning_rate": 8.847489579684373e-07, "loss": 8.765608072280884e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 420.25, "completions/min_length": 363.0, "epoch": 5.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.007762670051306486, "kl": 0.006360715022310615, "learning_rate": 8.846669854914395e-07, "loss": 6.397598917828873e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 437.5, "completions/min_length": 404.0, "epoch": 5.972058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02154131792485714, "kl": 0.007969961850903928, "learning_rate": 8.845849876735541e-07, "loss": 8.007825090317056e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 469.25, "completions/min_length": 384.0, "epoch": 5.973529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1118439435958862, "kl": 0.013925747945904732, "learning_rate": 8.845029645201831e-07, "loss": 0.0001390127872582525, "reward": 0.6768749952316284, "reward_std": 0.08401449024677277, "rewards/DrugCombAccuracyCOTORM/mean": 0.6214843988418579, "rewards/DrugCombAccuracyCOTORM/std": 0.40821394324302673, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 455.4375, "completions/min_length": 387.0, "epoch": 5.975, "frac_reward_zero_std": 0.5, "grad_norm": 1.0874903202056885, "kl": 0.010251478874124587, "learning_rate": 8.844209160367298e-07, "loss": 0.00010409022070234641, "reward": 0.3500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 451.375, "completions/min_length": 420.0, "epoch": 5.976470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01814587228000164, "kl": 0.008839027024805546, "learning_rate": 8.843388422285993e-07, "loss": 8.821735536912456e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/mean_length": 482.875, "completions/min_length": 324.0, "epoch": 5.977941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8476442694664001, "kl": 0.008336653932929039, "learning_rate": 8.842567431011987e-07, "loss": 8.511071791872382e-05, "reward": 0.8743250370025635, "reward_std": 0.1897018402814865, "rewards/DrugCombAccuracyCOTORM/mean": 0.8499374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.346458375453949, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9437500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.17876894772052765, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 440.625, "completions/min_length": 379.0, "epoch": 5.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9943146705627441, "kl": 0.009710046229884028, "learning_rate": 8.841746186599362e-07, "loss": 9.739026427268982e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 491.3125, "completions/min_length": 429.0, "epoch": 5.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.2837979793548584, "kl": 0.012561919400468469, "learning_rate": 8.840924689102221e-07, "loss": 0.00012525171041488647, "reward": 0.42762255668640137, "reward_std": 0.041373543441295624, "rewards/DrugCombAccuracyCOTORM/mean": 0.3683823347091675, "rewards/DrugCombAccuracyCOTORM/std": 0.3632603585720062, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3291666507720947, "rewards/DrugCombCoverageCOTORM/std": 0.572049617767334, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 453.0625, "completions/min_length": 362.0, "epoch": 5.982352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.16789399087429047, "kl": 0.007953982800245285, "learning_rate": 8.840102938574681e-07, "loss": 8.002344839042053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 457.9375, "completions/min_length": 404.0, "epoch": 5.983823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0750744342803955, "kl": 0.01231595897115767, "learning_rate": 8.839280935070877e-07, "loss": 0.00012263751705177128, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 436.6875, "completions/min_length": 377.0, "epoch": 5.985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01031523197889328, "kl": 0.0062030155677348375, "learning_rate": 8.838458678644962e-07, "loss": 6.196712638484314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 473.375, "completions/min_length": 395.0, "epoch": 5.9867647058823525, "frac_reward_zero_std": 0.0, "grad_norm": 1.6614820957183838, "kl": 0.011053352849557996, "learning_rate": 8.837636169351102e-07, "loss": 0.00011014193296432495, "reward": 0.890625, "reward_std": 0.2233666181564331, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 441.625, "completions/min_length": 382.0, "epoch": 5.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8677917122840881, "kl": 0.010538973263464868, "learning_rate": 8.836813407243485e-07, "loss": 0.00010366307105869055, "reward": 0.3499999940395355, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 416.4375, "completions/min_length": 358.0, "epoch": 5.989705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.016191501170396805, "kl": 0.009301392361521721, "learning_rate": 8.835990392376309e-07, "loss": 9.316897194366902e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 445.25, "completions/min_length": 420.0, "epoch": 5.991176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016122370958328247, "kl": 0.009073521941900253, "learning_rate": 8.835167124803794e-07, "loss": 9.044494072441012e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 408.625, "completions/min_length": 356.0, "epoch": 5.992647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01583423651754856, "kl": 0.009823973989114165, "learning_rate": 8.834343604580174e-07, "loss": 9.856641554506496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 463.0625, "completions/min_length": 402.0, "epoch": 5.9941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1163426637649536, "kl": 0.00841699750162661, "learning_rate": 8.833519831759701e-07, "loss": 8.476525545120239e-05, "reward": 0.723437488079071, "reward_std": 0.3836613893508911, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 566.6875, "completions/min_length": 466.0, "epoch": 5.995588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4015963077545166, "kl": 0.023349259281530976, "learning_rate": 8.832695806396644e-07, "loss": 0.00024933740496635437, "reward": 0.5095769166946411, "reward_std": 0.3505703806877136, "rewards/DrugCombAccuracyCOTORM/mean": 0.4312419295310974, "rewards/DrugCombAccuracyCOTORM/std": 0.3782377243041992, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6718547940254211, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 440.0, "completions/min_length": 372.0, "epoch": 5.997058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010520944371819496, "kl": 0.008674738579429686, "learning_rate": 8.831871528545285e-07, "loss": 8.745405648369342e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 543.0, "completions/min_length": 442.0, "epoch": 5.998529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.683952808380127, "kl": 0.006170028704218566, "learning_rate": 8.831046998259928e-07, "loss": 6.198137998580933e-05, "reward": 0.7987500429153442, "reward_std": 0.21517018973827362, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 474.875, "completions/min_length": 364.0, "epoch": 6.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.9137528538703918, "kl": 0.00884101481642574, "learning_rate": 8.83022221559489e-07, "loss": 8.941441774368286e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 484.1875, "completions/min_length": 359.0, "epoch": 6.001470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.013167262077331543, "kl": 0.0070301840314641595, "learning_rate": 8.829397180604505e-07, "loss": 7.020158955128863e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 427.0, "completions/min_length": 374.0, "epoch": 6.002941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.1828895807266235, "kl": 0.008600785164162517, "learning_rate": 8.828571893343125e-07, "loss": 8.594244718551636e-05, "reward": 0.637499988079071, "reward_std": 0.3489062786102295, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 469.1875, "completions/min_length": 422.0, "epoch": 6.004411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.017297804355621338, "kl": 0.008938809274695814, "learning_rate": 8.827746353865118e-07, "loss": 8.959471597336233e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 449.5, "completions/min_length": 374.0, "epoch": 6.0058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.016135014593601227, "kl": 0.00824143411591649, "learning_rate": 8.826920562224867e-07, "loss": 8.284240175271407e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 450.4375, "completions/min_length": 372.0, "epoch": 6.007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.056699905544519424, "kl": 0.01077460590749979, "learning_rate": 8.826094518476774e-07, "loss": 0.00010742438462330028, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 421.625, "completions/min_length": 378.0, "epoch": 6.008823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011126644909381866, "kl": 0.00847597373649478, "learning_rate": 8.825268222675257e-07, "loss": 8.39963904581964e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 470.0625, "completions/min_length": 378.0, "epoch": 6.010294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009450638666749, "kl": 0.007651847903616726, "learning_rate": 8.824441674874752e-07, "loss": 7.68779864301905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 454.125, "completions/min_length": 402.0, "epoch": 6.011764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.22660326957702637, "kl": 0.011810484807938337, "learning_rate": 8.823614875129705e-07, "loss": 0.00011839163926197216, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 460.6875, "completions/min_length": 398.0, "epoch": 6.0132352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.113772988319397, "kl": 0.007272214163094759, "learning_rate": 8.822787823494588e-07, "loss": 7.243642176035792e-05, "reward": 0.7437499761581421, "reward_std": 0.1989930123090744, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 398.75, "completions/min_length": 295.0, "epoch": 6.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01730172708630562, "kl": 0.009282266488298774, "learning_rate": 8.821960520023883e-07, "loss": 9.260026854462922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 454.25, "completions/min_length": 405.0, "epoch": 6.016176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0020467042922974, "kl": 0.008892473299056292, "learning_rate": 8.821132964772092e-07, "loss": 8.890029857866466e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 455.875, "completions/min_length": 399.0, "epoch": 6.017647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8934252262115479, "kl": 0.01027089566923678, "learning_rate": 8.82030515779373e-07, "loss": 0.0001030416096909903, "reward": 0.7250000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 440.0625, "completions/min_length": 346.0, "epoch": 6.019117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014459196478128433, "kl": 0.008938855957239866, "learning_rate": 8.819477099143333e-07, "loss": 8.967699977802113e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 444.375, "completions/min_length": 395.0, "epoch": 6.020588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013111978769302368, "kl": 0.008026443887501955, "learning_rate": 8.818648788875451e-07, "loss": 8.067923772614449e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 469.5625, "completions/min_length": 371.0, "epoch": 6.022058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2101519107818604, "kl": 0.014403015840798616, "learning_rate": 8.81782022704465e-07, "loss": 0.00014386471593752503, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 459.875, "completions/min_length": 411.0, "epoch": 6.023529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015447158366441727, "kl": 0.010515162837691605, "learning_rate": 8.816991413705514e-07, "loss": 0.00010526958794798702, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 451.5, "completions/min_length": 397.0, "epoch": 6.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.009795475751161575, "kl": 0.006776979425922036, "learning_rate": 8.816162348912643e-07, "loss": 6.740314711350948e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 439.75, "completions/min_length": 401.0, "epoch": 6.026470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 2.137188196182251, "kl": 0.009800876257941127, "learning_rate": 8.815333032720656e-07, "loss": 9.741261601448059e-05, "reward": 0.6937500238418579, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 420.3125, "completions/min_length": 404.0, "epoch": 6.027941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.020893944427371025, "kl": 0.008131177863106132, "learning_rate": 8.814503465184182e-07, "loss": 8.131776121445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 466.8125, "completions/min_length": 404.0, "epoch": 6.029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8743654489517212, "kl": 0.008288030745461583, "learning_rate": 8.813673646357873e-07, "loss": 8.234381675720215e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 383.375, "completions/min_length": 348.0, "epoch": 6.030882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9618611931800842, "kl": 0.006823491421528161, "learning_rate": 8.812843576296395e-07, "loss": 6.832927465438843e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 434.0625, "completions/min_length": 382.0, "epoch": 6.0323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.8199151754379272, "kl": 0.009809157811105251, "learning_rate": 8.812013255054434e-07, "loss": 9.792856872081757e-05, "reward": 0.687333345413208, "reward_std": 0.16396206617355347, "rewards/DrugCombAccuracyCOTORM/mean": 0.60916668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.49126818776130676, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 437.6875, "completions/min_length": 388.0, "epoch": 6.033823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8096976280212402, "kl": 0.00793278741184622, "learning_rate": 8.811182682686684e-07, "loss": 7.977352652233094e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 462.3125, "completions/min_length": 421.0, "epoch": 6.035294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.579069972038269, "kl": 0.009163575829006732, "learning_rate": 8.810351859247866e-07, "loss": 9.164959192276001e-05, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 450.5625, "completions/min_length": 359.0, "epoch": 6.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01747921295464039, "kl": 0.010895449784584343, "learning_rate": 8.809520784792709e-07, "loss": 0.00010794540139613673, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 493.3125, "completions/min_length": 419.0, "epoch": 6.038235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03387797623872757, "kl": 0.009258068981580436, "learning_rate": 8.808689459375964e-07, "loss": 9.208712435793132e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 486.0625, "completions/min_length": 422.0, "epoch": 6.0397058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.00963064655661583, "kl": 0.007837961078621447, "learning_rate": 8.807857883052396e-07, "loss": 7.813251431798562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 448.375, "completions/min_length": 391.0, "epoch": 6.041176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3191659450531006, "kl": 0.012631701538339257, "learning_rate": 8.807026055876786e-07, "loss": 0.00012670457363128662, "reward": 0.45000001788139343, "reward_std": 0.3552303612232208, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 437.5, "completions/min_length": 341.0, "epoch": 6.04264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6095229387283325, "kl": 0.010608592303469777, "learning_rate": 8.806193977903935e-07, "loss": 0.00010563433170318604, "reward": 0.42141667008399963, "reward_std": 0.24573948979377747, "rewards/DrugCombAccuracyCOTORM/mean": 0.38874998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.4919603765010834, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.10416668653488159, "rewards/DrugCombCoverageCOTORM/std": 1.0089874267578125, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 438.0, "completions/min_length": 349.0, "epoch": 6.044117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4602051973342896, "kl": 0.012131635565310717, "learning_rate": 8.805361649188657e-07, "loss": 0.00012239813804626465, "reward": 0.5642222166061401, "reward_std": 0.33777284622192383, "rewards/DrugCombAccuracyCOTORM/mean": 0.5273264050483704, "rewards/DrugCombAccuracyCOTORM/std": 0.4994659423828125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4236111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.8844507932662964, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 426.4375, "completions/min_length": 368.0, "epoch": 6.045588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.918590247631073, "kl": 0.012542822863906622, "learning_rate": 8.804529069785783e-07, "loss": 0.00012328929733484983, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 452.75, "completions/min_length": 371.0, "epoch": 6.047058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.7045954465866089, "kl": 0.02021230338141322, "learning_rate": 8.803696239750161e-07, "loss": 0.00019810348749160767, "reward": 0.7875000238418579, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 453.75, "completions/min_length": 382.0, "epoch": 6.048529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9939516186714172, "kl": 0.008799546980299056, "learning_rate": 8.802863159136658e-07, "loss": 8.785023965174332e-05, "reward": 0.7298274040222168, "reward_std": 0.22083263099193573, "rewards/DrugCombAccuracyCOTORM/mean": 0.7156696319580078, "rewards/DrugCombAccuracyCOTORM/std": 0.4407515525817871, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5729166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.704400897026062, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 435.125, "completions/min_length": 383.0, "epoch": 6.05, "frac_reward_zero_std": 0.5, "grad_norm": 0.9375252723693848, "kl": 0.010143725201487541, "learning_rate": 8.802029828000155e-07, "loss": 0.00010139346704818308, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 442.125, "completions/min_length": 373.0, "epoch": 6.051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.921475350856781, "kl": 0.010159149765968323, "learning_rate": 8.801196246395546e-07, "loss": 0.0001020282506942749, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 536.25, "completions/min_length": 453.0, "epoch": 6.052941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9596678614616394, "kl": 0.008998248958960176, "learning_rate": 8.800362414377751e-07, "loss": 9.009242057800293e-05, "reward": 0.8999999761581421, "reward_std": 0.10690448433160782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 444.0625, "completions/min_length": 402.0, "epoch": 6.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.028404848650097847, "kl": 0.011964299716055393, "learning_rate": 8.799528332001696e-07, "loss": 0.00011991037172265351, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 486.8125, "completions/min_length": 404.0, "epoch": 6.055882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9153934717178345, "kl": 0.008814989821985364, "learning_rate": 8.798693999322328e-07, "loss": 8.7442145741079e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 467.375, "completions/min_length": 413.0, "epoch": 6.057352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9617024660110474, "kl": 0.012166795670054853, "learning_rate": 8.797859416394617e-07, "loss": 0.00012187566608190536, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 486.0625, "completions/min_length": 423.0, "epoch": 6.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.9269594550132751, "kl": 0.011335544986650348, "learning_rate": 8.797024583273536e-07, "loss": 0.00011395060573704541, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 451.6875, "completions/min_length": 372.0, "epoch": 6.060294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02644614316523075, "kl": 0.010148578323423862, "learning_rate": 8.796189500014084e-07, "loss": 0.00010254295921185985, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 431.4375, "completions/min_length": 331.0, "epoch": 6.061764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9238130450248718, "kl": 0.009187367279082537, "learning_rate": 8.795354166671276e-07, "loss": 9.128451347351074e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 453.0, "completions/min_length": 404.0, "epoch": 6.063235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010588601231575012, "kl": 0.006679239100776613, "learning_rate": 8.79451858330014e-07, "loss": 6.662023952230811e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/mean_length": 523.375, "completions/min_length": 416.0, "epoch": 6.064705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.1236366033554077, "kl": 0.011307945707812905, "learning_rate": 8.793682749955722e-07, "loss": 0.00011233240365982056, "reward": 0.6374870538711548, "reward_std": 0.3146308660507202, "rewards/DrugCombAccuracyCOTORM/mean": 0.5663900971412659, "rewards/DrugCombAccuracyCOTORM/std": 0.4210628271102905, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.31262773275375366, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 402.0, "completions/min_length": 384.0, "epoch": 6.0661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01232500746846199, "kl": 0.0072414695750921965, "learning_rate": 8.792846666693086e-07, "loss": 7.293812814168632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 435.375, "completions/min_length": 396.0, "epoch": 6.067647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.010884277522563934, "kl": 0.00859966641291976, "learning_rate": 8.79201033356731e-07, "loss": 8.596485713496804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 421.5, "completions/min_length": 369.0, "epoch": 6.069117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013269318267703056, "kl": 0.006385566550306976, "learning_rate": 8.79117375063349e-07, "loss": 6.33887539152056e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 453.625, "completions/min_length": 389.0, "epoch": 6.070588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.2650117874145508, "kl": 0.012792987283319235, "learning_rate": 8.790336917946736e-07, "loss": 0.0001280723954550922, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 400.25, "completions/min_length": 320.0, "epoch": 6.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.020701050758361816, "kl": 0.010935303987935185, "learning_rate": 8.789499835562177e-07, "loss": 0.00011005112901329994, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 448.25, "completions/min_length": 373.0, "epoch": 6.073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00954941101372242, "kl": 0.00843359180726111, "learning_rate": 8.788662503534961e-07, "loss": 8.38609121274203e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 446.875, "completions/min_length": 390.0, "epoch": 6.075, "frac_reward_zero_std": 0.5, "grad_norm": 1.1546651124954224, "kl": 0.01037491241004318, "learning_rate": 8.787824921920248e-07, "loss": 0.00010324659524485469, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 430.1875, "completions/min_length": 394.0, "epoch": 6.076470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.10762999206781387, "kl": 0.010908949421718717, "learning_rate": 8.786987090773213e-07, "loss": 0.00010864148498512805, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 455.875, "completions/min_length": 390.0, "epoch": 6.077941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2276017665863037, "kl": 0.008819438284263015, "learning_rate": 8.786149010149053e-07, "loss": 8.861775131663308e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 481.1875, "completions/min_length": 420.0, "epoch": 6.079411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1806015968322754, "kl": 0.010264205862767994, "learning_rate": 8.785310680102978e-07, "loss": 0.00010199755342910066, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 467.125, "completions/min_length": 428.0, "epoch": 6.080882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01897568628191948, "kl": 0.007423662347719073, "learning_rate": 8.784472100690214e-07, "loss": 7.471586286555976e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 463.8125, "completions/min_length": 375.0, "epoch": 6.08235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012441454455256462, "kl": 0.007832105155102909, "learning_rate": 8.783633271966007e-07, "loss": 7.809622911736369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 473.4375, "completions/min_length": 400.0, "epoch": 6.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.05846993997693062, "kl": 0.013038731412962079, "learning_rate": 8.782794193985613e-07, "loss": 0.00013179874804336578, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 464.0, "completions/min_length": 412.0, "epoch": 6.0852941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.009412547573447227, "kl": 0.007089312886819243, "learning_rate": 8.781954866804311e-07, "loss": 7.111750892363489e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 435.1875, "completions/min_length": 412.0, "epoch": 6.086764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011645507998764515, "kl": 0.007182489731349051, "learning_rate": 8.781115290477394e-07, "loss": 7.147343421820551e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 443.5625, "completions/min_length": 379.0, "epoch": 6.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2914565801620483, "kl": 0.012868210906162858, "learning_rate": 8.780275465060171e-07, "loss": 0.00012643635272979736, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 456.125, "completions/min_length": 358.0, "epoch": 6.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02900473028421402, "kl": 0.009787904797121882, "learning_rate": 8.779435390607967e-07, "loss": 9.804870933294296e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 451.4375, "completions/min_length": 393.0, "epoch": 6.091176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.04778676480054855, "kl": 0.011518726823851466, "learning_rate": 8.778595067176122e-07, "loss": 0.00011574341624509543, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 457.8125, "completions/min_length": 390.0, "epoch": 6.0926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02115998975932598, "kl": 0.008283879375085235, "learning_rate": 8.777754494819998e-07, "loss": 8.228297519963235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 431.4375, "completions/min_length": 361.0, "epoch": 6.094117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.345038652420044, "kl": 0.012572165112942457, "learning_rate": 8.776913673594968e-07, "loss": 0.0001249164342880249, "reward": 0.8464166522026062, "reward_std": 0.32451075315475464, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 439.75, "completions/min_length": 391.0, "epoch": 6.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.027514202520251274, "kl": 0.010520925861783326, "learning_rate": 8.776072603556424e-07, "loss": 0.00010428459063405171, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 424.625, "completions/min_length": 370.0, "epoch": 6.097058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9973896145820618, "kl": 0.008506265003234148, "learning_rate": 8.775231284759773e-07, "loss": 8.49401913001202e-05, "reward": 0.4000000059604645, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 391.6875, "completions/min_length": 344.0, "epoch": 6.098529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009141474030911922, "kl": 0.006467546918429434, "learning_rate": 8.774389717260438e-07, "loss": 6.444245809689164e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 434.6875, "completions/min_length": 381.0, "epoch": 6.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.03621511906385422, "kl": 0.009690440259873867, "learning_rate": 8.773547901113861e-07, "loss": 9.748773300088942e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 422.375, "completions/min_length": 395.0, "epoch": 6.101470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9875614643096924, "kl": 0.010555071057751775, "learning_rate": 8.772705836375496e-07, "loss": 0.00010525991820031777, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 470.0, "completions/min_length": 414.0, "epoch": 6.102941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016994645819067955, "kl": 0.008669972768984735, "learning_rate": 8.77186352310082e-07, "loss": 8.646777860121801e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 393.375, "completions/min_length": 328.0, "epoch": 6.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 4.985409736633301, "kl": 0.008085580193437636, "learning_rate": 8.771020961345319e-07, "loss": 8.06190146249719e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 449.375, "completions/min_length": 384.0, "epoch": 6.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1086125373840332, "kl": 0.009105936624109745, "learning_rate": 8.770178151164503e-07, "loss": 9.066325583262369e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 498.3125, "completions/min_length": 431.0, "epoch": 6.107352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8159562945365906, "kl": 0.010117153404280543, "learning_rate": 8.769335092613888e-07, "loss": 0.00010088831186294556, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 439.4375, "completions/min_length": 400.0, "epoch": 6.108823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.012102605774998665, "kl": 0.0068294486263766885, "learning_rate": 8.768491785749017e-07, "loss": 6.804520671721548e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.875, "completions/min_length": 386.0, "epoch": 6.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1001111268997192, "kl": 0.008461104473099113, "learning_rate": 8.767648230625446e-07, "loss": 8.48378986120224e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 447.125, "completions/min_length": 404.0, "epoch": 6.1117647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.047795057296753, "kl": 0.010130928596481681, "learning_rate": 8.766804427298743e-07, "loss": 0.00010074302554130554, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 398.625, "completions/min_length": 357.0, "epoch": 6.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.038445595651865005, "kl": 0.010794098838232458, "learning_rate": 8.765960375824497e-07, "loss": 0.00010713938536355272, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 461.625, "completions/min_length": 376.0, "epoch": 6.114705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.017379870638251305, "kl": 0.00905791379045695, "learning_rate": 8.765116076258312e-07, "loss": 9.062520257430151e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 511.6875, "completions/min_length": 446.0, "epoch": 6.116176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.4241721630096436, "kl": 0.00913166522514075, "learning_rate": 8.764271528655809e-07, "loss": 9.180605411529541e-05, "reward": 0.6824896335601807, "reward_std": 0.2780511975288391, "rewards/DrugCombAccuracyCOTORM/mean": 0.611901044845581, "rewards/DrugCombAccuracyCOTORM/std": 0.37351861596107483, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9296875, "rewards/DrugCombCoverageCOTORM/std": 0.12884704768657684, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 458.125, "completions/min_length": 381.0, "epoch": 6.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0019948482513428, "kl": 0.010759674711152911, "learning_rate": 8.763426733072622e-07, "loss": 0.00010767296771518886, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 462.8125, "completions/min_length": 408.0, "epoch": 6.1191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.019525274634361267, "kl": 0.011429046047851443, "learning_rate": 8.762581689564408e-07, "loss": 0.00011424973490647972, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 392.9375, "completions/min_length": 329.0, "epoch": 6.120588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.021329980343580246, "kl": 0.009574801428243518, "learning_rate": 8.761736398186832e-07, "loss": 9.588460670784116e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 428.125, "completions/min_length": 370.0, "epoch": 6.122058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013426778838038445, "kl": 0.006708120694383979, "learning_rate": 8.760890858995582e-07, "loss": 6.702963582938537e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 476.3125, "completions/min_length": 391.0, "epoch": 6.123529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0265077352523804, "kl": 0.010553386295214295, "learning_rate": 8.760045072046363e-07, "loss": 0.0001054922686307691, "reward": 0.784375011920929, "reward_std": 0.14429889619350433, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.34960293769836426, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.3010398745536804, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 440.1875, "completions/min_length": 387.0, "epoch": 6.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.015781015157699585, "kl": 0.006690120091661811, "learning_rate": 8.759199037394886e-07, "loss": 6.718990334775299e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 456.4375, "completions/min_length": 395.0, "epoch": 6.126470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011358235031366348, "kl": 0.01060747355222702, "learning_rate": 8.758352755096892e-07, "loss": 0.00010403485794086009, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 456.25, "completions/min_length": 417.0, "epoch": 6.127941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2929421663284302, "kl": 0.007883924758061767, "learning_rate": 8.75750622520813e-07, "loss": 7.868558168411255e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 435.4375, "completions/min_length": 342.0, "epoch": 6.129411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.009243274107575417, "kl": 0.007391303079202771, "learning_rate": 8.756659447784367e-07, "loss": 7.487158291041851e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 447.0625, "completions/min_length": 349.0, "epoch": 6.1308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8203220367431641, "kl": 0.007600125041790307, "learning_rate": 8.755812422881385e-07, "loss": 7.606221333844587e-05, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 414.3125, "completions/min_length": 357.0, "epoch": 6.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.013238521292805672, "kl": 0.008808458456769586, "learning_rate": 8.754965150554987e-07, "loss": 8.827220153762028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 490.75, "completions/min_length": 412.0, "epoch": 6.133823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01845596916973591, "kl": 0.00941854645498097, "learning_rate": 8.754117630860987e-07, "loss": 9.440004214411601e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 450.75, "completions/min_length": 388.0, "epoch": 6.135294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020360229536890984, "kl": 0.00751106848474592, "learning_rate": 8.753269863855217e-07, "loss": 7.524705870309845e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 473.3125, "completions/min_length": 432.0, "epoch": 6.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8429387211799622, "kl": 0.012464741943404078, "learning_rate": 8.752421849593527e-07, "loss": 0.0001241937279701233, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 479.625, "completions/min_length": 391.0, "epoch": 6.1382352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.011950045824050903, "kl": 0.008044863352552056, "learning_rate": 8.751573588131782e-07, "loss": 8.12043872429058e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 421.1875, "completions/min_length": 321.0, "epoch": 6.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.016112908720970154, "kl": 0.0085024475120008, "learning_rate": 8.750725079525864e-07, "loss": 8.553337829653174e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 463.875, "completions/min_length": 428.0, "epoch": 6.141176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02354065142571926, "kl": 0.009543424705043435, "learning_rate": 8.749876323831669e-07, "loss": 9.604338993085548e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 479.375, "completions/min_length": 378.0, "epoch": 6.142647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8858002424240112, "kl": 0.007694646250456572, "learning_rate": 8.749027321105112e-07, "loss": 7.635050860699266e-05, "reward": 0.5966249704360962, "reward_std": 0.03770539164543152, "rewards/DrugCombAccuracyCOTORM/mean": 0.5309374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.48794543743133545, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.3145764470100403, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 467.9375, "completions/min_length": 382.0, "epoch": 6.144117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8548950552940369, "kl": 0.008398589096032083, "learning_rate": 8.748178071402121e-07, "loss": 8.3784936578013e-05, "reward": 0.8614583611488342, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.17078250646591187, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.2561737895011902, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 486.25, "completions/min_length": 351.0, "epoch": 6.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2318413257598877, "kl": 0.012652970501221716, "learning_rate": 8.747328574778645e-07, "loss": 0.00012622054782696068, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 446.0, "completions/min_length": 406.0, "epoch": 6.147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.028272312134504318, "kl": 0.01057740137912333, "learning_rate": 8.746478831290646e-07, "loss": 0.00010489372652955353, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 413.125, "completions/min_length": 365.0, "epoch": 6.148529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.12306789308786392, "kl": 0.01398662501014769, "learning_rate": 8.745628840994102e-07, "loss": 0.00014470660244114697, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 471.625, "completions/min_length": 408.0, "epoch": 6.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.06412104517221451, "kl": 0.01204103883355856, "learning_rate": 8.744778603945011e-07, "loss": 0.00012126781803090125, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 476.0, "completions/min_length": 378.0, "epoch": 6.151470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1006377935409546, "kl": 0.007727353135123849, "learning_rate": 8.74392812019938e-07, "loss": 7.753843237878755e-05, "reward": 0.9121952652931213, "reward_std": 0.12130722403526306, "rewards/DrugCombAccuracyCOTORM/mean": 0.8975357413291931, "rewards/DrugCombAccuracyCOTORM/std": 0.2171134501695633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9416666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 489.5625, "completions/min_length": 413.0, "epoch": 6.152941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8727500438690186, "kl": 0.008911786018870771, "learning_rate": 8.74307738981324e-07, "loss": 8.979120320873335e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 453.0625, "completions/min_length": 364.0, "epoch": 6.154411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0025198459625244, "kl": 0.009142575203441083, "learning_rate": 8.742226412842635e-07, "loss": 9.165645315079018e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 492.5625, "completions/min_length": 452.0, "epoch": 6.155882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.965536892414093, "kl": 0.01356800552457571, "learning_rate": 8.741375189343624e-07, "loss": 0.0001371433463646099, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 499.5, "completions/min_length": 461.0, "epoch": 6.1573529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.021224573254585266, "kl": 0.008713182993233204, "learning_rate": 8.740523719372283e-07, "loss": 8.785815589362755e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 497.25, "completions/min_length": 394.0, "epoch": 6.158823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015418661758303642, "kl": 0.00778296566568315, "learning_rate": 8.739672002984706e-07, "loss": 7.830978574929759e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 417.875, "completions/min_length": 314.0, "epoch": 6.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.023871272802352905, "kl": 0.009483160101808608, "learning_rate": 8.738820040237001e-07, "loss": 9.480002336204052e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 477.375, "completions/min_length": 415.0, "epoch": 6.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017233233898878098, "kl": 0.008176537929102778, "learning_rate": 8.737967831185294e-07, "loss": 8.170495857484639e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 434.0625, "completions/min_length": 335.0, "epoch": 6.163235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011732907965779305, "kl": 0.009396182489581406, "learning_rate": 8.737115375885726e-07, "loss": 9.390168997924775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 456.0, "completions/min_length": 333.0, "epoch": 6.1647058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.3173885345458984, "kl": 0.010138960788026452, "learning_rate": 8.736262674394454e-07, "loss": 0.00010117143392562866, "reward": 0.71875, "reward_std": 0.39963412284851074, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 417.5, "completions/min_length": 355.0, "epoch": 6.166176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012826676480472088, "kl": 0.007243069703690708, "learning_rate": 8.735409726767653e-07, "loss": 7.234563236124814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 414.4375, "completions/min_length": 347.0, "epoch": 6.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011333960108458996, "kl": 0.009087118552997708, "learning_rate": 8.734556533061512e-07, "loss": 9.094717097468674e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 420.625, "completions/min_length": 392.0, "epoch": 6.169117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8833463788032532, "kl": 0.008264539181254804, "learning_rate": 8.733703093332237e-07, "loss": 8.262693881988525e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 463.25, "completions/min_length": 401.0, "epoch": 6.170588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1815099716186523, "kl": 0.00971845118328929, "learning_rate": 8.732849407636051e-07, "loss": 9.767866868060082e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 453.5, "completions/min_length": 410.0, "epoch": 6.172058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2807388305664062, "kl": 0.01246645301580429, "learning_rate": 8.731995476029194e-07, "loss": 0.00012473238166421652, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 492.0, "completions/min_length": 417.0, "epoch": 6.173529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.09400524944067001, "kl": 0.013723034877330065, "learning_rate": 8.731141298567919e-07, "loss": 0.00013784135808236897, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 437.25, "completions/min_length": 373.0, "epoch": 6.175, "frac_reward_zero_std": 1.0, "grad_norm": 0.012462902814149857, "kl": 0.007114960113540292, "learning_rate": 8.730286875308497e-07, "loss": 7.096242188708857e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 484.4375, "completions/min_length": 377.0, "epoch": 6.176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9478513598442078, "kl": 0.008250246406532824, "learning_rate": 8.729432206307217e-07, "loss": 8.23289155960083e-05, "reward": 0.6625000238418579, "reward_std": 0.13678567111492157, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 454.9375, "completions/min_length": 387.0, "epoch": 6.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9183520674705505, "kl": 0.011493736645206809, "learning_rate": 8.728577291620381e-07, "loss": 0.000114332367957104, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 439.125, "completions/min_length": 401.0, "epoch": 6.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 2.128110408782959, "kl": 0.0218571899458766, "learning_rate": 8.727722131304307e-07, "loss": 0.000213875129702501, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 562.9375, "completions/min_length": 489.0, "epoch": 6.180882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5885324478149414, "kl": 0.01123373256996274, "learning_rate": 8.726866725415334e-07, "loss": 0.00011233612895011902, "reward": 0.5373095273971558, "reward_std": 0.23533570766448975, "rewards/DrugCombAccuracyCOTORM/mean": 0.4502827525138855, "rewards/DrugCombAccuracyCOTORM/std": 0.3097642958164215, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 462.1875, "completions/min_length": 358.0, "epoch": 6.182352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9614549279212952, "kl": 0.008601618465036154, "learning_rate": 8.726011074009813e-07, "loss": 8.604675531387329e-05, "reward": 0.5249999761581421, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.46875, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 445.3125, "completions/min_length": 384.0, "epoch": 6.1838235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.01845131441950798, "kl": 0.006811985746026039, "learning_rate": 8.725155177144112e-07, "loss": 6.790999759687111e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 479.5, "completions/min_length": 412.0, "epoch": 6.185294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01077567134052515, "kl": 0.006362336105667055, "learning_rate": 8.724299034874614e-07, "loss": 6.368813046719879e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 430.125, "completions/min_length": 377.0, "epoch": 6.186764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015340778045356274, "kl": 0.007684937561862171, "learning_rate": 8.723442647257722e-07, "loss": 7.713590457569808e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 467.4375, "completions/min_length": 388.0, "epoch": 6.188235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9540268778800964, "kl": 0.008269657962955534, "learning_rate": 8.722586014349849e-07, "loss": 8.21697321953252e-05, "reward": 0.7907444834709167, "reward_std": 0.1564609706401825, "rewards/DrugCombAccuracyCOTORM/mean": 0.7610000371932983, "rewards/DrugCombAccuracyCOTORM/std": 0.36194291710853577, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.819444477558136, "rewards/DrugCombCoverageCOTORM/std": 0.194047212600708, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 513.3125, "completions/min_length": 397.0, "epoch": 6.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010101545602083206, "kl": 0.006786473677493632, "learning_rate": 8.721729136207432e-07, "loss": 6.758579547749832e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 433.125, "completions/min_length": 374.0, "epoch": 6.1911764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 1.2776751518249512, "kl": 0.008543938398361206, "learning_rate": 8.720872012886917e-07, "loss": 8.544325828552246e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 463.0625, "completions/min_length": 361.0, "epoch": 6.192647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0581729412078857, "kl": 0.008008288568817079, "learning_rate": 8.72001464444477e-07, "loss": 8.02353024482727e-05, "reward": 0.9812500476837158, "reward_std": 0.025877445936203003, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 453.6875, "completions/min_length": 402.0, "epoch": 6.194117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8750591278076172, "kl": 0.008828455931507051, "learning_rate": 8.719157030937473e-07, "loss": 8.869171142578125e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 461.6875, "completions/min_length": 410.0, "epoch": 6.195588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0499111413955688, "kl": 0.016274216468445957, "learning_rate": 8.718299172421524e-07, "loss": 0.00016246704035438597, "reward": 0.7437499761581421, "reward_std": 0.21286733448505402, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 429.1875, "completions/min_length": 371.0, "epoch": 6.197058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015240543521940708, "kl": 0.008589420933276415, "learning_rate": 8.717441068953435e-07, "loss": 8.648469520267099e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 459.0, "completions/min_length": 403.0, "epoch": 6.198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8760860562324524, "kl": 0.0077943820506334305, "learning_rate": 8.716582720589735e-07, "loss": 7.793831900926307e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 424.9375, "completions/min_length": 380.0, "epoch": 6.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.015927724540233612, "kl": 0.00961992610245943, "learning_rate": 8.71572412738697e-07, "loss": 9.679701179265976e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 488.0, "completions/min_length": 365.0, "epoch": 6.201470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8970174193382263, "kl": 0.007763910689391196, "learning_rate": 8.714865289401704e-07, "loss": 7.743731839582324e-05, "reward": 0.7399250268936157, "reward_std": 0.16092084348201752, "rewards/DrugCombAccuracyCOTORM/mean": 0.6944375038146973, "rewards/DrugCombAccuracyCOTORM/std": 0.4076162576675415, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.21489661931991577, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 463.875, "completions/min_length": 383.0, "epoch": 6.202941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.04203474894165993, "kl": 0.009817403159104288, "learning_rate": 8.714006206690514e-07, "loss": 9.791857883101329e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 480.875, "completions/min_length": 404.0, "epoch": 6.204411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.3082752227783203, "kl": 0.011942162178456783, "learning_rate": 8.713146879309994e-07, "loss": 0.0001187696325359866, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 453.4375, "completions/min_length": 378.0, "epoch": 6.205882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0675745010375977, "kl": 0.009585162391886115, "learning_rate": 8.712287307316755e-07, "loss": 9.668060374679044e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 454.375, "completions/min_length": 407.0, "epoch": 6.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01621522568166256, "kl": 0.008376337355002761, "learning_rate": 8.711427490767422e-07, "loss": 8.402281673625112e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 422.5, "completions/min_length": 374.0, "epoch": 6.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.011172034777700901, "kl": 0.007606630912050605, "learning_rate": 8.710567429718638e-07, "loss": 7.568975706817582e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 505.1875, "completions/min_length": 385.0, "epoch": 6.2102941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.8966168761253357, "kl": 0.009495687554590404, "learning_rate": 8.709707124227064e-07, "loss": 9.52475966187194e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 442.8125, "completions/min_length": 395.0, "epoch": 6.211764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3879424333572388, "kl": 0.009590626112185419, "learning_rate": 8.708846574349372e-07, "loss": 9.561330080032349e-05, "reward": 0.75, "reward_std": 0.35523033142089844, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 392.6875, "completions/min_length": 321.0, "epoch": 6.213235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0837304592132568, "kl": 0.009000206948257983, "learning_rate": 8.707985780142251e-07, "loss": 9.030848741531372e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 456.125, "completions/min_length": 416.0, "epoch": 6.214705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0549315214157104, "kl": 0.009765769820660353, "learning_rate": 8.707124741662412e-07, "loss": 9.604988008504733e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 426.1875, "completions/min_length": 369.0, "epoch": 6.216176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.6054730415344238, "kl": 0.011726867407560349, "learning_rate": 8.706263458966579e-07, "loss": 0.00011651217937469482, "reward": 0.637499988079071, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 458.375, "completions/min_length": 419.0, "epoch": 6.2176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.009907667525112629, "kl": 0.0068357615964487195, "learning_rate": 8.705401932111486e-07, "loss": 6.814034713897854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 456.75, "completions/min_length": 399.0, "epoch": 6.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0741502046585083, "kl": 0.00934683345258236, "learning_rate": 8.704540161153892e-07, "loss": 9.344776481157169e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 451.0625, "completions/min_length": 375.0, "epoch": 6.220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012012179009616375, "kl": 0.007335082395002246, "learning_rate": 8.703678146150566e-07, "loss": 7.309377542696893e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 486.8125, "completions/min_length": 400.0, "epoch": 6.222058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9617428183555603, "kl": 0.007980455760844052, "learning_rate": 8.702815887158296e-07, "loss": 7.895111775724217e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 453.3125, "completions/min_length": 369.0, "epoch": 6.223529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.835080623626709, "kl": 0.010402328334748745, "learning_rate": 8.701953384233889e-07, "loss": 0.00010456889867782593, "reward": 0.8082916736602783, "reward_std": 0.07677031308412552, "rewards/DrugCombAccuracyCOTORM/mean": 0.7694791555404663, "rewards/DrugCombAccuracyCOTORM/std": 0.2649628520011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.17179608345031738, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 413.9375, "completions/min_length": 348.0, "epoch": 6.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.01243564672768116, "kl": 0.007444491027854383, "learning_rate": 8.70109063743416e-07, "loss": 7.479882333427668e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 450.5625, "completions/min_length": 415.0, "epoch": 6.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013856894336640835, "kl": 0.007431676029227674, "learning_rate": 8.700227646815946e-07, "loss": 7.435554289259017e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 479.1875, "completions/min_length": 381.0, "epoch": 6.227941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03059658221900463, "kl": 0.006119056022725999, "learning_rate": 8.699364412436098e-07, "loss": 6.191808643052354e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 459.875, "completions/min_length": 375.0, "epoch": 6.229411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010779793374240398, "kl": 0.007075699861161411, "learning_rate": 8.698500934351487e-07, "loss": 7.044101948849857e-05, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 478.1875, "completions/min_length": 415.0, "epoch": 6.230882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8269452452659607, "kl": 0.01087557664141059, "learning_rate": 8.697637212618991e-07, "loss": 0.0001092448947019875, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 463.4375, "completions/min_length": 412.0, "epoch": 6.232352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765980243682861, "kl": 0.009166660136543214, "learning_rate": 8.696773247295515e-07, "loss": 9.213387966156006e-05, "reward": 0.6875, "reward_std": 0.44403791427612305, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 476.625, "completions/min_length": 426.0, "epoch": 6.233823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.143484354019165, "kl": 0.011056549847126007, "learning_rate": 8.695909038437972e-07, "loss": 0.0001101866364479065, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 446.875, "completions/min_length": 421.0, "epoch": 6.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.172460675239563, "kl": 0.010711957467719913, "learning_rate": 8.695044586103295e-07, "loss": 0.00010570883750915527, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 473.125, "completions/min_length": 423.0, "epoch": 6.2367647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.9158912897109985, "kl": 0.007373415632173419, "learning_rate": 8.694179890348433e-07, "loss": 7.372349500656128e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 408.25, "completions/min_length": 364.0, "epoch": 6.238235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.023487403988838196, "kl": 0.006295943108852953, "learning_rate": 8.693314951230346e-07, "loss": 6.329963798634708e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 442.25, "completions/min_length": 348.0, "epoch": 6.239705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.025804538279771805, "kl": 0.009974140208214521, "learning_rate": 8.692449768806018e-07, "loss": 9.982455230783671e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 444.375, "completions/min_length": 389.0, "epoch": 6.241176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.011326005682349205, "kl": 0.007413637707941234, "learning_rate": 8.691584343132443e-07, "loss": 7.422184717142954e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 418.25, "completions/min_length": 378.0, "epoch": 6.242647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.028439776971936226, "kl": 0.008383137872442603, "learning_rate": 8.690718674266635e-07, "loss": 8.389382855966687e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 436.25, "completions/min_length": 396.0, "epoch": 6.2441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8628733158111572, "kl": 0.009298269636929035, "learning_rate": 8.68985276226562e-07, "loss": 9.271907765651122e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 462.9375, "completions/min_length": 408.0, "epoch": 6.245588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01569235324859619, "kl": 0.009181839879602194, "learning_rate": 8.688986607186444e-07, "loss": 9.055297414306551e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 449.3125, "completions/min_length": 398.0, "epoch": 6.247058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0618996620178223, "kl": 0.00903942238073796, "learning_rate": 8.688120209086164e-07, "loss": 9.026900806929916e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 548.25, "completions/min_length": 460.0, "epoch": 6.248529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.364341139793396, "kl": 0.009653736604377627, "learning_rate": 8.68725356802186e-07, "loss": 9.691715240478516e-05, "reward": 0.7901041507720947, "reward_std": 0.32504117488861084, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3542075455188751, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.734375, "rewards/DrugCombCoverageCOTORM/std": 0.678348183631897, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 433.0, "completions/min_length": 401.0, "epoch": 6.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.02361798658967018, "kl": 0.010413188487291336, "learning_rate": 8.68638668405062e-07, "loss": 0.0001047705954988487, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 407.875, "completions/min_length": 351.0, "epoch": 6.251470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.206187129020691, "kl": 0.01275694987270981, "learning_rate": 8.685519557229555e-07, "loss": 0.00012668967247009277, "reward": 0.659375011920929, "reward_std": 0.21030908823013306, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 494.5625, "completions/min_length": 417.0, "epoch": 6.252941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1550512313842773, "kl": 0.0074499547481536865, "learning_rate": 8.684652187615789e-07, "loss": 7.490068674087524e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 434.5625, "completions/min_length": 338.0, "epoch": 6.254411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7987000942230225, "kl": 0.007918439456261694, "learning_rate": 8.683784575266461e-07, "loss": 7.944517710711807e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 416.0, "completions/min_length": 337.0, "epoch": 6.2558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.018478401005268097, "kl": 0.008959484403021634, "learning_rate": 8.682916720238729e-07, "loss": 8.96194251254201e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 459.0, "completions/min_length": 417.0, "epoch": 6.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02385747618973255, "kl": 0.009817417245358229, "learning_rate": 8.682048622589764e-07, "loss": 9.835668606683612e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 411.9375, "completions/min_length": 372.0, "epoch": 6.258823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02215101197361946, "kl": 0.009487115894444287, "learning_rate": 8.681180282376752e-07, "loss": 9.499504812993109e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 526.4375, "completions/min_length": 447.0, "epoch": 6.260294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.142512321472168, "kl": 0.007783657754771411, "learning_rate": 8.6803116996569e-07, "loss": 7.658451795578003e-05, "reward": 0.1966250240802765, "reward_std": 0.2272362858057022, "rewards/DrugCombAccuracyCOTORM/mean": 0.14552083611488342, "rewards/DrugCombAccuracyCOTORM/std": 0.2712889313697815, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1979166567325592, "rewards/DrugCombCoverageCOTORM/std": 0.7846177816390991, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 453.125, "completions/min_length": 410.0, "epoch": 6.261764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.030030608177185, "kl": 0.012376777362078428, "learning_rate": 8.679442874487426e-07, "loss": 0.00012290477752685547, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 457.8125, "completions/min_length": 406.0, "epoch": 6.2632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.0001602172851562, "kl": 0.011641740798950195, "learning_rate": 8.678573806925569e-07, "loss": 0.00011513754725456238, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 450.9375, "completions/min_length": 410.0, "epoch": 6.264705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.2290360927581787, "kl": 0.03574324934743345, "learning_rate": 8.677704497028577e-07, "loss": 0.0003330335021018982, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 506.5625, "completions/min_length": 407.0, "epoch": 6.266176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1401621103286743, "kl": 0.01021742564626038, "learning_rate": 8.67683494485372e-07, "loss": 0.00010249114711768925, "reward": 0.8843749761581421, "reward_std": 0.2150321900844574, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 480.125, "completions/min_length": 437.0, "epoch": 6.267647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01442428957670927, "kl": 0.010344843612983823, "learning_rate": 8.675965150458282e-07, "loss": 0.00010387141082901508, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 481.75, "completions/min_length": 404.0, "epoch": 6.269117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9789706468582153, "kl": 0.008869428420439363, "learning_rate": 8.675095113899563e-07, "loss": 8.88038775883615e-05, "reward": 0.692187488079071, "reward_std": 0.25520801544189453, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 419.875, "completions/min_length": 362.0, "epoch": 6.270588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0135867595672607, "kl": 0.009527107002213597, "learning_rate": 8.674224835234878e-07, "loss": 9.46149230003357e-05, "reward": 0.559374988079071, "reward_std": 0.04568428173661232, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 456.75, "completions/min_length": 392.0, "epoch": 6.272058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3259786367416382, "kl": 0.00921549997292459, "learning_rate": 8.67335431452156e-07, "loss": 9.261444211006165e-05, "reward": 0.7062499523162842, "reward_std": 0.41509848833084106, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 458.6875, "completions/min_length": 418.0, "epoch": 6.273529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.987608015537262, "kl": 0.008910561446100473, "learning_rate": 8.672483551816955e-07, "loss": 8.887052536010742e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 486.75, "completions/min_length": 446.0, "epoch": 6.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.012850990518927574, "kl": 0.008666217210702598, "learning_rate": 8.671612547178427e-07, "loss": 8.649715164210647e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 497.0625, "completions/min_length": 400.0, "epoch": 6.276470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2696645259857178, "kl": 0.007600935525260866, "learning_rate": 8.670741300663358e-07, "loss": 7.659941911697388e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 420.6875, "completions/min_length": 364.0, "epoch": 6.277941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8139556646347046, "kl": 0.01206647278741002, "learning_rate": 8.669869812329141e-07, "loss": 0.00012223422527313232, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 432.4375, "completions/min_length": 376.0, "epoch": 6.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017269562929868698, "kl": 0.0078012365847826, "learning_rate": 8.668998082233185e-07, "loss": 7.807606016285717e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 424.25, "completions/min_length": 356.0, "epoch": 6.280882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8084951043128967, "kl": 0.007927665254101157, "learning_rate": 8.668126110432924e-07, "loss": 7.846951484680176e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 468.8125, "completions/min_length": 393.0, "epoch": 6.2823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.1283670663833618, "kl": 0.011728020152077079, "learning_rate": 8.667253896985796e-07, "loss": 0.000116690993309021, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 457.5625, "completions/min_length": 368.0, "epoch": 6.283823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.6889035105705261, "kl": 0.008867251453921199, "learning_rate": 8.666381441949262e-07, "loss": 8.831851300783455e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 524.3125, "completions/min_length": 475.0, "epoch": 6.285294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9662288427352905, "kl": 0.009517790516838431, "learning_rate": 8.665508745380799e-07, "loss": 9.612646681489423e-05, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 425.9375, "completions/min_length": 346.0, "epoch": 6.286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7859588861465454, "kl": 0.007784594781696796, "learning_rate": 8.664635807337894e-07, "loss": 7.893145084381104e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 478.3125, "completions/min_length": 417.0, "epoch": 6.288235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.020534902811050415, "kl": 0.009229291812516749, "learning_rate": 8.663762627878057e-07, "loss": 9.165103256236762e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 462.4375, "completions/min_length": 425.0, "epoch": 6.2897058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.021706916391849518, "kl": 0.007821697276085615, "learning_rate": 8.662889207058808e-07, "loss": 7.75488733779639e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 440.25, "completions/min_length": 380.0, "epoch": 6.291176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.018480956554412842, "kl": 0.010524324956350029, "learning_rate": 8.662015544937691e-07, "loss": 0.00010547332931309938, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 473.375, "completions/min_length": 409.0, "epoch": 6.29264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7730857729911804, "kl": 0.00792427931446582, "learning_rate": 8.661141641572255e-07, "loss": 7.896125316619873e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 424.25, "completions/min_length": 368.0, "epoch": 6.294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.1681995689868927, "kl": 0.015469158417545259, "learning_rate": 8.660267497020072e-07, "loss": 0.00015367940068244934, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 487.6875, "completions/min_length": 425.0, "epoch": 6.295588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9040508270263672, "kl": 0.008598709478974342, "learning_rate": 8.659393111338731e-07, "loss": 8.549541234970093e-05, "reward": 0.2782500088214874, "reward_std": 0.21094012260437012, "rewards/DrugCombAccuracyCOTORM/mean": 0.1525000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.33908700942993164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.786165177822113, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 528.4375, "completions/min_length": 444.0, "epoch": 6.297058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.197574496269226, "kl": 0.008559568086639047, "learning_rate": 8.658518484585833e-07, "loss": 8.604209870100021e-05, "reward": 0.5081610679626465, "reward_std": 0.2459908425807953, "rewards/DrugCombAccuracyCOTORM/mean": 0.40615105628967285, "rewards/DrugCombAccuracyCOTORM/std": 0.4241331219673157, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8324021697044373, "rewards/DrugCombCoverageCOTORM/std": 0.2032076120376587, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 433.6875, "completions/min_length": 374.0, "epoch": 6.298529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014986073598265648, "kl": 0.008564735995605588, "learning_rate": 8.657643616818995e-07, "loss": 8.621088636573404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 433.6875, "completions/min_length": 328.0, "epoch": 6.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.029412701725959778, "kl": 0.010804438032209873, "learning_rate": 8.656768508095852e-07, "loss": 0.00010696756362449378, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 468.3125, "completions/min_length": 361.0, "epoch": 6.301470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 2.3318285942077637, "kl": 0.009893701528199017, "learning_rate": 8.655893158474054e-07, "loss": 9.901821613311768e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 525.8125, "completions/min_length": 450.0, "epoch": 6.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.132684588432312, "kl": 0.010495041264221072, "learning_rate": 8.655017568011267e-07, "loss": 0.00010482648212928325, "reward": 0.7467396259307861, "reward_std": 0.1217096820473671, "rewards/DrugCombAccuracyCOTORM/mean": 0.6954687833786011, "rewards/DrugCombAccuracyCOTORM/std": 0.37844735383987427, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9036458134651184, "rewards/DrugCombCoverageCOTORM/std": 0.11757459491491318, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 467.875, "completions/min_length": 353.0, "epoch": 6.304411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.145229697227478, "kl": 0.010114633711054921, "learning_rate": 8.65414173676517e-07, "loss": 0.00010099634528160095, "reward": 0.44281667470932007, "reward_std": 0.4024885296821594, "rewards/DrugCombAccuracyCOTORM/mean": 0.32487499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.47261711955070496, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8291666507720947, "rewards/DrugCombCoverageCOTORM/std": 0.5035982131958008, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 471.75, "completions/min_length": 405.0, "epoch": 6.305882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2476316690444946, "kl": 0.008925381698645651, "learning_rate": 8.653265664793466e-07, "loss": 8.835643529891968e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 488.4375, "completions/min_length": 425.0, "epoch": 6.307352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0624737739562988, "kl": 0.013034585164859891, "learning_rate": 8.652389352153864e-07, "loss": 0.0001307353377342224, "reward": 0.7722083330154419, "reward_std": 0.14223913848400116, "rewards/DrugCombAccuracyCOTORM/mean": 0.7347916960716248, "rewards/DrugCombAccuracyCOTORM/std": 0.36315691471099854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.22334785759449005, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 444.8125, "completions/min_length": 388.0, "epoch": 6.3088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0365819931030273, "kl": 0.012753857532516122, "learning_rate": 8.651512798904093e-07, "loss": 0.0001270341599592939, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 392.1875, "completions/min_length": 352.0, "epoch": 6.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011684073135256767, "kl": 0.00867827981710434, "learning_rate": 8.650636005101902e-07, "loss": 8.703875937499106e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 465.25, "completions/min_length": 407.0, "epoch": 6.311764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013968616724014282, "kl": 0.007362053147517145, "learning_rate": 8.649758970805048e-07, "loss": 7.356790592893958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 455.5625, "completions/min_length": 403.0, "epoch": 6.313235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0443557500839233, "kl": 0.010132120456546545, "learning_rate": 8.64888169607131e-07, "loss": 0.00010052323341369629, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 401.9375, "completions/min_length": 337.0, "epoch": 6.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018387751653790474, "kl": 0.00663614843506366, "learning_rate": 8.64800418095848e-07, "loss": 6.65756015223451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 432.5, "completions/min_length": 385.0, "epoch": 6.3161764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.8039642572402954, "kl": 0.006731565110385418, "learning_rate": 8.647126425524367e-07, "loss": 6.731599569320679e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 509.5, "completions/min_length": 389.0, "epoch": 6.317647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0648376941680908, "kl": 0.009887086926028132, "learning_rate": 8.646248429826793e-07, "loss": 0.0001004216173896566, "reward": 0.9813802242279053, "reward_std": 0.05266471579670906, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.98046875, "rewards/DrugCombCoverageCOTORM/std": 0.078125, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 439.125, "completions/min_length": 390.0, "epoch": 6.319117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.008662046864628792, "kl": 0.006883651367388666, "learning_rate": 8.645370193923602e-07, "loss": 6.875026156194508e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 431.6875, "completions/min_length": 400.0, "epoch": 6.320588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.00799164641648531, "kl": 0.005412225844338536, "learning_rate": 8.644491717872647e-07, "loss": 5.414834595285356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 395.5625, "completions/min_length": 352.0, "epoch": 6.322058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012635434046387672, "kl": 0.009717393550090492, "learning_rate": 8.643613001731801e-07, "loss": 9.774042700883001e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 512.6875, "completions/min_length": 410.0, "epoch": 6.323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7852869033813477, "kl": 0.009026080835610628, "learning_rate": 8.642734045558951e-07, "loss": 9.053200483322144e-05, "reward": 0.8285146951675415, "reward_std": 0.15553118288516998, "rewards/DrugCombAccuracyCOTORM/mean": 0.7926746606826782, "rewards/DrugCombAccuracyCOTORM/std": 0.3438369929790497, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9437500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.12093387544155121, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 444.9375, "completions/min_length": 388.0, "epoch": 6.325, "frac_reward_zero_std": 0.5, "grad_norm": 1.132763147354126, "kl": 0.00855394045356661, "learning_rate": 8.641854849412e-07, "loss": 8.485450234729797e-05, "reward": 0.9043715596199036, "reward_std": 0.14798161387443542, "rewards/DrugCombAccuracyCOTORM/mean": 0.8851519823074341, "rewards/DrugCombAccuracyCOTORM/std": 0.2686181664466858, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.1087811216711998, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 478.0625, "completions/min_length": 418.0, "epoch": 6.326470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1078892946243286, "kl": 0.007315038703382015, "learning_rate": 8.64097541334887e-07, "loss": 7.339897274505347e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 437.1875, "completions/min_length": 356.0, "epoch": 6.327941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9508563280105591, "kl": 0.010189251275733113, "learning_rate": 8.640095737427493e-07, "loss": 0.00010181963443756104, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 439.625, "completions/min_length": 369.0, "epoch": 6.329411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9663699865341187, "kl": 0.007427577744238079, "learning_rate": 8.63921582170582e-07, "loss": 7.405132055282593e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 443.9375, "completions/min_length": 389.0, "epoch": 6.330882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9266196489334106, "kl": 0.008456603274680674, "learning_rate": 8.638335666241819e-07, "loss": 8.488446474075317e-05, "reward": 0.8374999761581421, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 415.5, "completions/min_length": 376.0, "epoch": 6.33235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009155475534498692, "kl": 0.007306164945475757, "learning_rate": 8.637455271093472e-07, "loss": 7.26920843590051e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 499.25, "completions/min_length": 410.0, "epoch": 6.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9652360677719116, "kl": 0.009713058709166944, "learning_rate": 8.636574636318776e-07, "loss": 9.660597424954176e-05, "reward": 0.8205000162124634, "reward_std": 0.1792791187763214, "rewards/DrugCombAccuracyCOTORM/mean": 0.7912499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.37342333793640137, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 429.625, "completions/min_length": 391.0, "epoch": 6.3352941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 1.0231996774673462, "kl": 0.022842913633212447, "learning_rate": 8.635693761975746e-07, "loss": 0.0002210372913395986, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 494.125, "completions/min_length": 384.0, "epoch": 6.336764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.069162130355835, "kl": 0.006295755971223116, "learning_rate": 8.634812648122413e-07, "loss": 6.311386823654175e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 435.375, "completions/min_length": 387.0, "epoch": 6.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010921570472419262, "kl": 0.007060295669361949, "learning_rate": 8.63393129481682e-07, "loss": 7.01551980455406e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 536.4375, "completions/min_length": 419.0, "epoch": 6.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0136828422546387, "kl": 0.011357651790603995, "learning_rate": 8.633049702117031e-07, "loss": 0.00011419504880905151, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 455.6875, "completions/min_length": 380.0, "epoch": 6.341176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.064670443534851, "kl": 0.010634931037202477, "learning_rate": 8.632167870081121e-07, "loss": 0.00010728836059570312, "reward": 0.887499988079071, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 504.3125, "completions/min_length": 435.0, "epoch": 6.3426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9259809255599976, "kl": 0.010608537239022553, "learning_rate": 8.631285798767182e-07, "loss": 0.00010755658149719238, "reward": 0.9900000095367432, "reward_std": 0.02828424982726574, "rewards/DrugCombAccuracyCOTORM/mean": 0.987500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.05000000074505806, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 467.1875, "completions/min_length": 424.0, "epoch": 6.344117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.06711702048778534, "kl": 0.0096712710801512, "learning_rate": 8.630403488233326e-07, "loss": 9.587723616277799e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 409.5625, "completions/min_length": 333.0, "epoch": 6.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02068750187754631, "kl": 0.008328780182637274, "learning_rate": 8.629520938537675e-07, "loss": 8.3846491179429e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 418.3125, "completions/min_length": 359.0, "epoch": 6.347058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.012741140089929104, "kl": 0.007393530453555286, "learning_rate": 8.628638149738371e-07, "loss": 7.407458906527609e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 527.375, "completions/min_length": 417.0, "epoch": 6.348529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015039361082017422, "kl": 0.00787764135748148, "learning_rate": 8.627755121893568e-07, "loss": 7.88427860243246e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 457.125, "completions/min_length": 413.0, "epoch": 6.35, "frac_reward_zero_std": 1.0, "grad_norm": 0.011137355118989944, "kl": 0.006898446707054973, "learning_rate": 8.626871855061437e-07, "loss": 6.922972534084693e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 471.6875, "completions/min_length": 415.0, "epoch": 6.351470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7658488750457764, "kl": 0.010299637680873275, "learning_rate": 8.625988349300169e-07, "loss": 0.00010200589895248413, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 458.875, "completions/min_length": 405.0, "epoch": 6.352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016715778037905693, "kl": 0.009618534008041024, "learning_rate": 8.625104604667963e-07, "loss": 9.621055505704135e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 433.0, "completions/min_length": 381.0, "epoch": 6.354411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8694031834602356, "kl": 0.0076692544389516115, "learning_rate": 8.624220621223039e-07, "loss": 7.687778997933492e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 510.375, "completions/min_length": 455.0, "epoch": 6.355882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9310352206230164, "kl": 0.01060471823439002, "learning_rate": 8.623336399023635e-07, "loss": 0.00010490110435057431, "reward": 0.5895833373069763, "reward_std": 0.1119585856795311, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 467.125, "completions/min_length": 415.0, "epoch": 6.357352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8956692814826965, "kl": 0.010245502926409245, "learning_rate": 8.622451938127997e-07, "loss": 0.0001023691293084994, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 509.8125, "completions/min_length": 418.0, "epoch": 6.358823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9525249004364014, "kl": 0.01810739701613784, "learning_rate": 8.621567238594391e-07, "loss": 0.00018133968114852905, "reward": 0.8667083382606506, "reward_std": 0.11757273226976395, "rewards/DrugCombAccuracyCOTORM/mean": 0.8424999713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.2566666603088379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 436.8125, "completions/min_length": 372.0, "epoch": 6.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.996649444103241, "kl": 0.010371598415076733, "learning_rate": 8.620682300481102e-07, "loss": 0.00010295171523466706, "reward": 0.7124999761581421, "reward_std": 0.2386719137430191, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 410.0625, "completions/min_length": 348.0, "epoch": 6.3617647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.009519322775304317, "kl": 0.00749229034408927, "learning_rate": 8.619797123846426e-07, "loss": 7.480211206711829e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/mean_length": 516.5, "completions/min_length": 430.0, "epoch": 6.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.944148600101471, "kl": 0.010590157238766551, "learning_rate": 8.618911708748676e-07, "loss": 0.00010596212086966261, "reward": 0.5318333506584167, "reward_std": 0.0634627714753151, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2083333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.9727776646614075, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 452.5625, "completions/min_length": 401.0, "epoch": 6.364705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010835623368620872, "kl": 0.007826573681086302, "learning_rate": 8.618026055246182e-07, "loss": 7.839668251108378e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 459.5625, "completions/min_length": 428.0, "epoch": 6.366176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.015528429299592972, "kl": 0.008810139261186123, "learning_rate": 8.617140163397285e-07, "loss": 8.750653069000691e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 435.375, "completions/min_length": 344.0, "epoch": 6.367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8252681493759155, "kl": 0.008041052613407373, "learning_rate": 8.616254033260349e-07, "loss": 8.071959018707275e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 461.5, "completions/min_length": 414.0, "epoch": 6.3691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.983687162399292, "kl": 0.008760566706769168, "learning_rate": 8.615367664893749e-07, "loss": 8.74652323545888e-05, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 534.75, "completions/min_length": 397.0, "epoch": 6.370588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.3338934183120728, "kl": 0.008610670454800129, "learning_rate": 8.614481058355877e-07, "loss": 8.498132228851318e-05, "reward": 0.9208333492279053, "reward_std": 0.1763271689414978, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 421.3125, "completions/min_length": 352.0, "epoch": 6.372058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04783501848578453, "kl": 0.011282134102657437, "learning_rate": 8.61359421370514e-07, "loss": 0.00011226562492083758, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 470.875, "completions/min_length": 375.0, "epoch": 6.373529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8055718541145325, "kl": 0.010641931323334575, "learning_rate": 8.61270713099996e-07, "loss": 0.00010633204510668293, "reward": 0.840916633605957, "reward_std": 0.13497698307037354, "rewards/DrugCombAccuracyCOTORM/mean": 0.8193750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.2844206988811493, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.24247947335243225, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 481.5, "completions/min_length": 415.0, "epoch": 6.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.01011938787996769, "kl": 0.008085667970590293, "learning_rate": 8.611819810298777e-07, "loss": 8.089775656117126e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 446.0625, "completions/min_length": 393.0, "epoch": 6.376470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.1735246181488037, "kl": 0.013288839254528284, "learning_rate": 8.610932251660046e-07, "loss": 0.00013199448585510254, "reward": 0.8812500238418579, "reward_std": 0.3358757197856903, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 429.75, "completions/min_length": 391.0, "epoch": 6.377941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03499123454093933, "kl": 0.010121069848537445, "learning_rate": 8.610044455142237e-07, "loss": 0.0001009800544125028, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 424.625, "completions/min_length": 370.0, "epoch": 6.379411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9509227275848389, "kl": 0.00894540164154023, "learning_rate": 8.609156420803834e-07, "loss": 8.897483348846436e-05, "reward": 0.4437499940395355, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 467.75, "completions/min_length": 403.0, "epoch": 6.3808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.009836405515670776, "kl": 0.007948762038722634, "learning_rate": 8.608268148703339e-07, "loss": 7.946621917653829e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 477.8125, "completions/min_length": 411.0, "epoch": 6.382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.799045205116272, "kl": 0.0089242187095806, "learning_rate": 8.607379638899272e-07, "loss": 8.86768102645874e-05, "reward": 0.9391250014305115, "reward_std": 0.12156426906585693, "rewards/DrugCombAccuracyCOTORM/mean": 0.9304167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.2068883180618286, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 485.5, "completions/min_length": 422.0, "epoch": 6.383823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.784676730632782, "kl": 0.0075655714608728886, "learning_rate": 8.606490891450162e-07, "loss": 7.61672854423523e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 445.125, "completions/min_length": 380.0, "epoch": 6.385294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7455452084541321, "kl": 0.008248085156083107, "learning_rate": 8.605601906414559e-07, "loss": 8.32842051750049e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 468.0625, "completions/min_length": 418.0, "epoch": 6.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8549734354019165, "kl": 0.010262819938361645, "learning_rate": 8.604712683851027e-07, "loss": 0.00010243058204650879, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 481.6875, "completions/min_length": 439.0, "epoch": 6.3882352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403702616691589, "kl": 0.008460226003080606, "learning_rate": 8.603823223818146e-07, "loss": 8.470470493193716e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 448.0, "completions/min_length": 383.0, "epoch": 6.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01983647421002388, "kl": 0.011361631564795971, "learning_rate": 8.602933526374511e-07, "loss": 0.00011337806063238531, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 492.625, "completions/min_length": 425.0, "epoch": 6.391176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014428837224841118, "kl": 0.009617529110983014, "learning_rate": 8.602043591578734e-07, "loss": 9.629975829739124e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 459.25, "completions/min_length": 392.0, "epoch": 6.392647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7834718823432922, "kl": 0.007665058132261038, "learning_rate": 8.60115341948944e-07, "loss": 7.658451795578003e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 462.5, "completions/min_length": 382.0, "epoch": 6.394117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0951874256134033, "kl": 0.013618062483146787, "learning_rate": 8.600263010165273e-07, "loss": 0.000134973248350434, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 451.0, "completions/min_length": 375.0, "epoch": 6.395588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 19.405797958374023, "kl": 0.10406198923010379, "learning_rate": 8.599372363664889e-07, "loss": 0.0010676676174625754, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 451.1875, "completions/min_length": 388.0, "epoch": 6.397058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7525874376296997, "kl": 0.007418446708470583, "learning_rate": 8.598481480046963e-07, "loss": 7.3947012424469e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 458.5, "completions/min_length": 385.0, "epoch": 6.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2923442125320435, "kl": 0.014115381985902786, "learning_rate": 8.597590359370184e-07, "loss": 0.0001410171389579773, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 472.75, "completions/min_length": 432.0, "epoch": 6.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749102592468262, "kl": 0.012214223854243755, "learning_rate": 8.596699001693255e-07, "loss": 0.00012172268179710954, "reward": 0.8191125392913818, "reward_std": 0.06985677778720856, "rewards/DrugCombAccuracyCOTORM/mean": 0.7922500371932983, "rewards/DrugCombAccuracyCOTORM/std": 0.24563953280448914, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8531249761581421, "rewards/DrugCombCoverageCOTORM/std": 0.152171790599823, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 472.5625, "completions/min_length": 403.0, "epoch": 6.401470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.534089207649231, "kl": 0.010828540194779634, "learning_rate": 8.5958074070749e-07, "loss": 0.00010864436626434326, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 472.25, "completions/min_length": 433.0, "epoch": 6.402941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.9863572120666504, "kl": 0.010861794813536108, "learning_rate": 8.594915575573851e-07, "loss": 0.00010911849676631391, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.75, "completions/min_length": 381.0, "epoch": 6.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015867194160819054, "kl": 0.00878667610231787, "learning_rate": 8.594023507248862e-07, "loss": 8.829854778014123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 439.625, "completions/min_length": 386.0, "epoch": 6.405882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4674768447875977, "kl": 0.009586137370206416, "learning_rate": 8.593131202158701e-07, "loss": 9.499490261077881e-05, "reward": 0.6343749761581421, "reward_std": 0.36406826972961426, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 455.3125, "completions/min_length": 416.0, "epoch": 6.4073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.1616121381521225, "kl": 0.01281687815207988, "learning_rate": 8.592238660362148e-07, "loss": 0.00012889540812466294, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/mean_length": 553.5625, "completions/min_length": 435.0, "epoch": 6.408823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.04270171374082565, "kl": 0.012422681204043329, "learning_rate": 8.591345881918004e-07, "loss": 0.00012373688514344394, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 462.8125, "completions/min_length": 400.0, "epoch": 6.410294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014467001892626286, "kl": 0.0081408207770437, "learning_rate": 8.590452866885081e-07, "loss": 8.105395681923255e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 470.0, "completions/min_length": 412.0, "epoch": 6.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0503684282302856, "kl": 0.011784943984821439, "learning_rate": 8.589559615322209e-07, "loss": 0.00011818110942840576, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 473.3125, "completions/min_length": 421.0, "epoch": 6.413235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.08632116764783859, "kl": 0.010739470948465168, "learning_rate": 8.588666127288235e-07, "loss": 0.00010904792725341395, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 468.5, "completions/min_length": 395.0, "epoch": 6.4147058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.573637843132019, "kl": 0.013874656753614545, "learning_rate": 8.587772402842017e-07, "loss": 0.00013899803161621094, "reward": 0.2874999940395355, "reward_std": 0.2636806070804596, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 413.5625, "completions/min_length": 360.0, "epoch": 6.416176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.009979856200516224, "kl": 0.0067948654759675264, "learning_rate": 8.586878442042433e-07, "loss": 6.798980757594109e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 455.375, "completions/min_length": 414.0, "epoch": 6.41764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02211817353963852, "kl": 0.009447980904951692, "learning_rate": 8.585984244948374e-07, "loss": 9.456838597543538e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 450.1875, "completions/min_length": 405.0, "epoch": 6.419117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1782928705215454, "kl": 0.010286577045917511, "learning_rate": 8.58508981161875e-07, "loss": 0.00010357052087783813, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 530.375, "completions/min_length": 410.0, "epoch": 6.420588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.017342742532491684, "kl": 0.010142459999769926, "learning_rate": 8.584195142112481e-07, "loss": 0.00010097319318447262, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 481.3125, "completions/min_length": 401.0, "epoch": 6.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.6681981086730957, "kl": 0.008725734078325331, "learning_rate": 8.583300236488505e-07, "loss": 8.589401841163635e-05, "reward": 0.7563750147819519, "reward_std": 0.20331111550331116, "rewards/DrugCombAccuracyCOTORM/mean": 0.7150000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4387064278125763, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 375.375, "completions/min_length": 295.0, "epoch": 6.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9221690893173218, "kl": 0.007927661295980215, "learning_rate": 8.582405094805779e-07, "loss": 7.998943328857422e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 430.4375, "completions/min_length": 357.0, "epoch": 6.425, "frac_reward_zero_std": 0.5, "grad_norm": 1.137854814529419, "kl": 0.012305675074458122, "learning_rate": 8.581509717123272e-07, "loss": 0.00012189894914627075, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 448.5625, "completions/min_length": 374.0, "epoch": 6.426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02298947609961033, "kl": 0.010930947260931134, "learning_rate": 8.580614103499966e-07, "loss": 0.00011045794235542417, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 432.0625, "completions/min_length": 385.0, "epoch": 6.427941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4885132312774658, "kl": 0.011852201307192445, "learning_rate": 8.579718253994866e-07, "loss": 0.00011837482452392578, "reward": 0.7856666445732117, "reward_std": 0.31710106134414673, "rewards/DrugCombAccuracyCOTORM/mean": 0.7425000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.39771851897239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 522.75, "completions/min_length": 420.0, "epoch": 6.429411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.7101122736930847, "kl": 0.007071349304169416, "learning_rate": 8.578822168666987e-07, "loss": 7.038042531348765e-05, "reward": 0.7387088537216187, "reward_std": 0.12069596350193024, "rewards/DrugCombAccuracyCOTORM/mean": 0.6872923374176025, "rewards/DrugCombAccuracyCOTORM/std": 0.37325048446655273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8887500166893005, "rewards/DrugCombCoverageCOTORM/std": 0.20369504392147064, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 424.875, "completions/min_length": 376.0, "epoch": 6.430882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013888522051274776, "kl": 0.007934608613140881, "learning_rate": 8.577925847575359e-07, "loss": 7.986484706634656e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 444.125, "completions/min_length": 371.0, "epoch": 6.432352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6635950803756714, "kl": 0.011008683824911714, "learning_rate": 8.577029290779032e-07, "loss": 0.00010961294174194336, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 444.5625, "completions/min_length": 398.0, "epoch": 6.4338235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.059299111366272, "kl": 0.010330956894904375, "learning_rate": 8.576132498337067e-07, "loss": 0.00010348111391067505, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 470.0625, "completions/min_length": 405.0, "epoch": 6.435294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9521111845970154, "kl": 0.010939836967736483, "learning_rate": 8.575235470308543e-07, "loss": 0.00010958602069877088, "reward": 0.5532500147819519, "reward_std": 0.07834918797016144, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9227073788642883, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 479.75, "completions/min_length": 403.0, "epoch": 6.436764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9441611766815186, "kl": 0.011170513229444623, "learning_rate": 8.574338206752554e-07, "loss": 0.00011224144691368565, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 509.3125, "completions/min_length": 380.0, "epoch": 6.438235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8488951325416565, "kl": 0.008790373685769737, "learning_rate": 8.57344070772821e-07, "loss": 8.815195178613067e-05, "reward": 0.6887302398681641, "reward_std": 0.14086492359638214, "rewards/DrugCombAccuracyCOTORM/mean": 0.6343502998352051, "rewards/DrugCombAccuracyCOTORM/std": 0.44072675704956055, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.2580992579460144, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 414.5625, "completions/min_length": 369.0, "epoch": 6.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.017763186246156693, "kl": 0.01141495374031365, "learning_rate": 8.572542973294634e-07, "loss": 0.00011469061428215355, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 473.125, "completions/min_length": 417.0, "epoch": 6.4411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.012275071814656258, "kl": 0.0074298769468441606, "learning_rate": 8.57164500351097e-07, "loss": 7.404440111713484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 451.375, "completions/min_length": 401.0, "epoch": 6.442647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01607060246169567, "kl": 0.009690218488685787, "learning_rate": 8.57074679843637e-07, "loss": 9.61101904977113e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 433.25, "completions/min_length": 385.0, "epoch": 6.444117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0475560426712036, "kl": 0.009706780430860817, "learning_rate": 8.569848358130009e-07, "loss": 9.734688501339406e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 461.3125, "completions/min_length": 355.0, "epoch": 6.445588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9810686707496643, "kl": 0.008607096620835364, "learning_rate": 8.568949682651072e-07, "loss": 8.520422852598131e-05, "reward": 0.909250020980835, "reward_std": 0.1701066941022873, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/mean_length": 504.6875, "completions/min_length": 381.0, "epoch": 6.447058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.755885899066925, "kl": 0.007437311462126672, "learning_rate": 8.568050772058761e-07, "loss": 7.545948028564453e-05, "reward": 0.6444504261016846, "reward_std": 0.20829643309116364, "rewards/DrugCombAccuracyCOTORM/mean": 0.6344320774078369, "rewards/DrugCombAccuracyCOTORM/std": 0.4658283293247223, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.369047611951828, "rewards/DrugCombCoverageCOTORM/std": 0.9535707235336304, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 492.6875, "completions/min_length": 437.0, "epoch": 6.448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.813895046710968, "kl": 0.006826927070505917, "learning_rate": 8.567151626412295e-07, "loss": 6.803125143051147e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 476.5625, "completions/min_length": 400.0, "epoch": 6.45, "frac_reward_zero_std": 0.5, "grad_norm": 1.0809657573699951, "kl": 0.009924400364980102, "learning_rate": 8.566252245770907e-07, "loss": 0.00010097767517436296, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 463.625, "completions/min_length": 371.0, "epoch": 6.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8348227739334106, "kl": 0.008930793264880776, "learning_rate": 8.565352630193848e-07, "loss": 8.968615293269977e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 531.5625, "completions/min_length": 412.0, "epoch": 6.452941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3897957801818848, "kl": 0.008587955497205257, "learning_rate": 8.564452779740378e-07, "loss": 8.501112461090088e-05, "reward": 0.510783314704895, "reward_std": 0.23811236023902893, "rewards/DrugCombAccuracyCOTORM/mean": 0.41868749260902405, "rewards/DrugCombAccuracyCOTORM/std": 0.43174174427986145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7583333253860474, "rewards/DrugCombCoverageCOTORM/std": 0.33277732133865356, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 475.0625, "completions/min_length": 413.0, "epoch": 6.454411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.037655234336853, "kl": 0.010177448159083724, "learning_rate": 8.56355269446978e-07, "loss": 0.00010384472989244387, "reward": 0.550000011920929, "reward_std": 0.053452249616384506, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 415.75, "completions/min_length": 369.0, "epoch": 6.455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0708087682724, "kl": 0.009881129721179605, "learning_rate": 8.562652374441349e-07, "loss": 9.85860824584961e-05, "reward": 0.543749988079071, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 507.375, "completions/min_length": 410.0, "epoch": 6.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.945048451423645, "kl": 0.013932307250797749, "learning_rate": 8.561751819714396e-07, "loss": 0.000141934011480771, "reward": 0.731249988079071, "reward_std": 0.2103356122970581, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 426.0, "completions/min_length": 370.0, "epoch": 6.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03118768334388733, "kl": 0.010408699163235724, "learning_rate": 8.560851030348246e-07, "loss": 0.00010197548544965684, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 491.75, "completions/min_length": 460.0, "epoch": 6.4602941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.014889854937791824, "kl": 0.008595210034400225, "learning_rate": 8.559950006402241e-07, "loss": 8.595543476985767e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 437.75, "completions/min_length": 375.0, "epoch": 6.461764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015917330980300903, "kl": 0.00963166484143585, "learning_rate": 8.55904874793574e-07, "loss": 9.667566337157041e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 473.4375, "completions/min_length": 411.0, "epoch": 6.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016275258734822273, "kl": 0.008605318260379136, "learning_rate": 8.558147255008115e-07, "loss": 8.563131268601865e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 420.1875, "completions/min_length": 397.0, "epoch": 6.464705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.016810715198516846, "kl": 0.00826275662984699, "learning_rate": 8.557245527678751e-07, "loss": 8.234746928792447e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 502.0, "completions/min_length": 417.0, "epoch": 6.466176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1298482418060303, "kl": 0.011342198122292757, "learning_rate": 8.556343566007056e-07, "loss": 0.00011428189463913441, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 443.5, "completions/min_length": 350.0, "epoch": 6.4676470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.4015074968338013, "kl": 0.014810294611379504, "learning_rate": 8.555441370052445e-07, "loss": 0.00014842301607131958, "reward": 0.4437499940395355, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.4375, "completions/min_length": 418.0, "epoch": 6.469117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8674062490463257, "kl": 0.007894852431491017, "learning_rate": 8.554538939874356e-07, "loss": 7.826444925740361e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 417.5625, "completions/min_length": 344.0, "epoch": 6.470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322282649576664, "kl": 0.007877349737100303, "learning_rate": 8.553636275532236e-07, "loss": 7.906679820735008e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 446.625, "completions/min_length": 408.0, "epoch": 6.472058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01050709281116724, "kl": 0.00856411806307733, "learning_rate": 8.552733377085552e-07, "loss": 8.629349758848548e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 465.4375, "completions/min_length": 387.0, "epoch": 6.473529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.062923550605774, "kl": 0.009191339369863272, "learning_rate": 8.551830244593783e-07, "loss": 9.196251630783081e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 482.375, "completions/min_length": 415.0, "epoch": 6.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9340819120407104, "kl": 0.010412350995466113, "learning_rate": 8.550926878116427e-07, "loss": 0.0001036282628774643, "reward": 0.609499990940094, "reward_std": 0.038851361721754074, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 489.5625, "completions/min_length": 425.0, "epoch": 6.476470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.918882429599762, "kl": 0.006680779159069061, "learning_rate": 8.550023277712995e-07, "loss": 6.734640192007646e-05, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 464.125, "completions/min_length": 424.0, "epoch": 6.477941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.020208612084388733, "kl": 0.00721478252671659, "learning_rate": 8.549119443443011e-07, "loss": 7.237678801175207e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 486.4375, "completions/min_length": 397.0, "epoch": 6.479411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.424221396446228, "kl": 0.008909025928005576, "learning_rate": 8.548215375366021e-07, "loss": 8.846074342727661e-05, "reward": 0.9125000238418579, "reward_std": 0.2474873661994934, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 423.8125, "completions/min_length": 354.0, "epoch": 6.480882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.018747586756944656, "kl": 0.011573616182431579, "learning_rate": 8.547311073541584e-07, "loss": 0.00011593892122618854, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 474.4375, "completions/min_length": 418.0, "epoch": 6.482352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8525363802909851, "kl": 0.010036510182544589, "learning_rate": 8.546406538029267e-07, "loss": 0.00010034405568148941, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 455.375, "completions/min_length": 389.0, "epoch": 6.483823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0012025833129883, "kl": 0.00937767163850367, "learning_rate": 8.545501768888664e-07, "loss": 9.403473814018071e-05, "reward": 0.6421874761581421, "reward_std": 0.14622291922569275, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 432.9375, "completions/min_length": 378.0, "epoch": 6.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014095117338001728, "kl": 0.008499422459863126, "learning_rate": 8.544596766179376e-07, "loss": 8.503886056132615e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 427.125, "completions/min_length": 391.0, "epoch": 6.4867647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.043252721428871155, "kl": 0.010912092518992722, "learning_rate": 8.543691529961023e-07, "loss": 0.00010926406685030088, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 378.8125, "completions/min_length": 327.0, "epoch": 6.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01133337989449501, "kl": 0.007296306546777487, "learning_rate": 8.542786060293241e-07, "loss": 7.34688073862344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 492.5625, "completions/min_length": 403.0, "epoch": 6.489705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0500831604003906, "kl": 0.010323435417376459, "learning_rate": 8.54188035723568e-07, "loss": 0.00010397844016551971, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 482.6875, "completions/min_length": 421.0, "epoch": 6.491176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014845846220850945, "kl": 0.007767230155877769, "learning_rate": 8.540974420848003e-07, "loss": 7.74441214161925e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 460.0625, "completions/min_length": 366.0, "epoch": 6.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.014247074723243713, "kl": 0.009326840983703732, "learning_rate": 8.540068251189893e-07, "loss": 9.309285087510943e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 475.5, "completions/min_length": 420.0, "epoch": 6.4941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.7279250621795654, "kl": 0.01288149249739945, "learning_rate": 8.539161848321045e-07, "loss": 0.00012984871864318848, "reward": 0.25058335065841675, "reward_std": 0.1931990385055542, "rewards/DrugCombAccuracyCOTORM/mean": 0.09916666895151138, "rewards/DrugCombAccuracyCOTORM/std": 0.25310954451560974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7124999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.3124277889728546, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 426.5, "completions/min_length": 373.0, "epoch": 6.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067756278440356255, "kl": 0.006434968905523419, "learning_rate": 8.538255212301171e-07, "loss": 6.419629789888859e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 436.875, "completions/min_length": 377.0, "epoch": 6.497058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010747998021543026, "kl": 0.007231088005937636, "learning_rate": 8.537348343189996e-07, "loss": 7.24001438356936e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 438.625, "completions/min_length": 394.0, "epoch": 6.498529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010945174843072891, "kl": 0.008214774657972157, "learning_rate": 8.536441241047268e-07, "loss": 8.229014929383993e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 490.1875, "completions/min_length": 425.0, "epoch": 6.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.9143452048301697, "kl": 0.009765773080289364, "learning_rate": 8.535533905932737e-07, "loss": 9.700260125100613e-05, "reward": 0.5551249980926514, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.4478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5045558214187622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 430.6875, "completions/min_length": 368.0, "epoch": 6.501470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010758720338344574, "kl": 0.007913887267932296, "learning_rate": 8.534626337906182e-07, "loss": 7.940847717691213e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 431.0625, "completions/min_length": 349.0, "epoch": 6.502941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.07240501046180725, "kl": 0.008838688721880317, "learning_rate": 8.533718537027388e-07, "loss": 8.74745674082078e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 485.375, "completions/min_length": 390.0, "epoch": 6.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017306003719568253, "kl": 0.019935489865019917, "learning_rate": 8.532810503356159e-07, "loss": 0.00020058901282027364, "reward": 0.7016666531562805, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.637499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3743883967399597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 442.1875, "completions/min_length": 350.0, "epoch": 6.5058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1951391696929932, "kl": 0.01037983549758792, "learning_rate": 8.531902236952316e-07, "loss": 0.00010404667409602553, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 458.375, "completions/min_length": 398.0, "epoch": 6.507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.010481549426913261, "kl": 0.006970589980483055, "learning_rate": 8.530993737875689e-07, "loss": 7.021486817393452e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 428.0625, "completions/min_length": 401.0, "epoch": 6.508823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.002982258796692, "kl": 0.010330938268452883, "learning_rate": 8.530085006186132e-07, "loss": 0.00010413896234240383, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 400.5, "completions/min_length": 329.0, "epoch": 6.510294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.012600057758390903, "kl": 0.00851271627470851, "learning_rate": 8.529176041943509e-07, "loss": 8.549352060072124e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 468.8125, "completions/min_length": 429.0, "epoch": 6.511764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6140906810760498, "kl": 0.010396841214969754, "learning_rate": 8.528266845207699e-07, "loss": 0.00010389089584350586, "reward": 0.800000011920929, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 434.8125, "completions/min_length": 394.0, "epoch": 6.5132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.020616833120584488, "kl": 0.00929067237302661, "learning_rate": 8.527357416038598e-07, "loss": 9.359247633256018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 452.375, "completions/min_length": 397.0, "epoch": 6.514705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3873270750045776, "kl": 0.009263335727155209, "learning_rate": 8.526447754496117e-07, "loss": 9.284913539886475e-05, "reward": 0.4749999940395355, "reward_std": 0.3962059020996094, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.8563488721847534, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 465.0625, "completions/min_length": 432.0, "epoch": 6.516176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9653951525688171, "kl": 0.010811446816660464, "learning_rate": 8.525537860640182e-07, "loss": 0.00010795146226882935, "reward": 0.9387716054916382, "reward_std": 0.13361641764640808, "rewards/DrugCombAccuracyCOTORM/mean": 0.9281519651412964, "rewards/DrugCombAccuracyCOTORM/std": 0.22362330555915833, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.15000000596046448, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 456.0625, "completions/min_length": 418.0, "epoch": 6.517647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.9214526414871216, "kl": 0.011935758404433727, "learning_rate": 8.524627734530736e-07, "loss": 0.00012072920799255371, "reward": 0.35468751192092896, "reward_std": 0.22027793526649475, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 454.375, "completions/min_length": 392.0, "epoch": 6.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.008028457872569561, "kl": 0.007120100082829595, "learning_rate": 8.523717376227733e-07, "loss": 7.133924373192713e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 456.5, "completions/min_length": 406.0, "epoch": 6.520588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6043115854263306, "kl": 0.00864335591904819, "learning_rate": 8.522806785791148e-07, "loss": 8.67694616317749e-05, "reward": 0.41875001788139343, "reward_std": 0.2682524621486664, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 490.875, "completions/min_length": 408.0, "epoch": 6.522058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4401060342788696, "kl": 0.011091098887845874, "learning_rate": 8.521895963280967e-07, "loss": 0.00011200830340385437, "reward": 0.7723214626312256, "reward_std": 0.33166471123695374, "rewards/DrugCombAccuracyCOTORM/mean": 0.7440476417541504, "rewards/DrugCombAccuracyCOTORM/std": 0.3964601457118988, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.3890872597694397, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 483.4375, "completions/min_length": 435.0, "epoch": 6.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015161438845098019, "kl": 0.00941033975686878, "learning_rate": 8.520984908757192e-07, "loss": 9.417466935701668e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 425.625, "completions/min_length": 372.0, "epoch": 6.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.015746358782052994, "kl": 0.008211017004214227, "learning_rate": 8.520073622279842e-07, "loss": 8.09674384072423e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 434.5625, "completions/min_length": 380.0, "epoch": 6.526470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012130193412303925, "kl": 0.009879593504592776, "learning_rate": 8.51916210390895e-07, "loss": 9.85147271421738e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 488.25, "completions/min_length": 448.0, "epoch": 6.527941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9089279174804688, "kl": 0.016146253561601043, "learning_rate": 8.518250353704563e-07, "loss": 0.00016479167970828712, "reward": 0.6380833387374878, "reward_std": 0.15411660075187683, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5404902100563049, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 431.5, "completions/min_length": 379.0, "epoch": 6.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01671418733894825, "kl": 0.008810610044747591, "learning_rate": 8.517338371726749e-07, "loss": 8.785826503299177e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 444.25, "completions/min_length": 384.0, "epoch": 6.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.056487798690796, "kl": 0.010778024909086525, "learning_rate": 8.516426158035583e-07, "loss": 0.00010747488704510033, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 422.5625, "completions/min_length": 311.0, "epoch": 6.5323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.009272809140384197, "kl": 0.0086442792089656, "learning_rate": 8.51551371269116e-07, "loss": 8.637442806502804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 471.9375, "completions/min_length": 359.0, "epoch": 6.533823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.018786873668432236, "kl": 0.0073526386404410005, "learning_rate": 8.514601035753591e-07, "loss": 7.176799408625811e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 424.25, "completions/min_length": 377.0, "epoch": 6.535294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.046985920518636703, "kl": 0.012575135100632906, "learning_rate": 8.513688127283001e-07, "loss": 0.00012229708954691887, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/mean_length": 533.875, "completions/min_length": 472.0, "epoch": 6.536764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6338704824447632, "kl": 0.022134700091555715, "learning_rate": 8.512774987339528e-07, "loss": 0.00022573769092559814, "reward": 0.8286666870117188, "reward_std": 0.2832164764404297, "rewards/DrugCombAccuracyCOTORM/mean": 0.8105729222297668, "rewards/DrugCombAccuracyCOTORM/std": 0.32314005494117737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5135884881019592, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 474.625, "completions/min_length": 399.0, "epoch": 6.538235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0388736724853516, "kl": 0.007861947175115347, "learning_rate": 8.511861615983329e-07, "loss": 7.890164852142334e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 454.25, "completions/min_length": 423.0, "epoch": 6.5397058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.6573811769485474, "kl": 0.0121618767734617, "learning_rate": 8.510948013274574e-07, "loss": 0.00012096762657165527, "reward": 0.9089166522026062, "reward_std": 0.2576225697994232, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 416.75, "completions/min_length": 349.0, "epoch": 6.541176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0671069473028183, "kl": 0.010273829801008105, "learning_rate": 8.510034179273449e-07, "loss": 0.00010302712325938046, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 462.5625, "completions/min_length": 406.0, "epoch": 6.54264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.028795765712857246, "kl": 0.011569920694455504, "learning_rate": 8.509120114040155e-07, "loss": 0.00011671608808683231, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 461.4375, "completions/min_length": 426.0, "epoch": 6.544117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.830209493637085, "kl": 0.0082335714250803, "learning_rate": 8.508205817634908e-07, "loss": 8.259834430646151e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 472.75, "completions/min_length": 396.0, "epoch": 6.545588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1709339618682861, "kl": 0.013533667428418994, "learning_rate": 8.507291290117939e-07, "loss": 0.00013430416584014893, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 427.75, "completions/min_length": 350.0, "epoch": 6.547058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4807350635528564, "kl": 0.010819230112247169, "learning_rate": 8.506376531549498e-07, "loss": 0.00010758638381958008, "reward": 0.824999988079071, "reward_std": 0.38508620858192444, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 403.3125, "completions/min_length": 359.0, "epoch": 6.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.053713358938694, "kl": 0.010515301139093935, "learning_rate": 8.505461541989844e-07, "loss": 0.00010464305523782969, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 494.0, "completions/min_length": 414.0, "epoch": 6.55, "frac_reward_zero_std": 0.5, "grad_norm": 0.887800931930542, "kl": 0.008501048549078405, "learning_rate": 8.504546321499254e-07, "loss": 8.524581789970398e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 524.0625, "completions/min_length": 481.0, "epoch": 6.551470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.507941722869873, "kl": 0.01157313515432179, "learning_rate": 8.503630870138022e-07, "loss": 0.00011532008647918701, "reward": 0.20737500488758087, "reward_std": 0.13854119181632996, "rewards/DrugCombAccuracyCOTORM/mean": 0.1120833307504654, "rewards/DrugCombAccuracyCOTORM/std": 0.19866545498371124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1770833283662796, "rewards/DrugCombCoverageCOTORM/std": 0.4491504430770874, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 478.1875, "completions/min_length": 441.0, "epoch": 6.552941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.025757791474461555, "kl": 0.009276565862819552, "learning_rate": 8.502715187966453e-07, "loss": 9.313134069088846e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 502.3125, "completions/min_length": 441.0, "epoch": 6.554411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.820766270160675, "kl": 0.008599941153079271, "learning_rate": 8.501799275044874e-07, "loss": 8.64435060066171e-05, "reward": 0.3395833373069763, "reward_std": 0.02509901113808155, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3958333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.3489401936531067, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 433.1875, "completions/min_length": 371.0, "epoch": 6.555882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9725432395935059, "kl": 0.007423049886710942, "learning_rate": 8.50088313143362e-07, "loss": 7.413777348119766e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 420.375, "completions/min_length": 350.0, "epoch": 6.557352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.014033995568752289, "kl": 0.00705481437034905, "learning_rate": 8.499966757193045e-07, "loss": 7.075312896631658e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 411.625, "completions/min_length": 338.0, "epoch": 6.5588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8510943651199341, "kl": 0.010508720763027668, "learning_rate": 8.499050152383518e-07, "loss": 0.00010344386100769043, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 488.4375, "completions/min_length": 389.0, "epoch": 6.560294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0650578737258911, "kl": 0.010178406722843647, "learning_rate": 8.498133317065421e-07, "loss": 0.00010145650594495237, "reward": 0.9233958721160889, "reward_std": 0.1372954249382019, "rewards/DrugCombAccuracyCOTORM/mean": 0.9127083420753479, "rewards/DrugCombAccuracyCOTORM/std": 0.23072902858257294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9322916269302368, "rewards/DrugCombCoverageCOTORM/std": 0.190561443567276, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 404.0, "completions/min_length": 360.0, "epoch": 6.561764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01067565381526947, "kl": 0.00647257745731622, "learning_rate": 8.497216251299155e-07, "loss": 6.485059566330165e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 503.25, "completions/min_length": 444.0, "epoch": 6.563235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.9905987977981567, "kl": 0.007702972157858312, "learning_rate": 8.49629895514513e-07, "loss": 7.656961679458618e-05, "reward": 0.6218750476837158, "reward_std": 0.22097086906433105, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 508.5, "completions/min_length": 427.0, "epoch": 6.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9146772027015686, "kl": 0.011044728104025126, "learning_rate": 8.49538142866378e-07, "loss": 0.00010973031021421775, "reward": 0.6732927560806274, "reward_std": 0.13589002192020416, "rewards/DrugCombAccuracyCOTORM/mean": 0.6145325899124146, "rewards/DrugCombAccuracyCOTORM/std": 0.45480239391326904, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8166666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.25473296642303467, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 478.6875, "completions/min_length": 460.0, "epoch": 6.5661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.8519043326377869, "kl": 0.009987418772652745, "learning_rate": 8.494463671915546e-07, "loss": 9.984523057937622e-05, "reward": 0.9052500128746033, "reward_std": 0.17580163478851318, "rewards/DrugCombAccuracyCOTORM/mean": 0.8841666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.3176476061344147, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 469.1875, "completions/min_length": 413.0, "epoch": 6.567647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0555299520492554, "kl": 0.00890992279164493, "learning_rate": 8.49354568496089e-07, "loss": 8.989125490188599e-05, "reward": 0.824999988079071, "reward_std": 0.19820626080036163, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.4069705307483673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 426.9375, "completions/min_length": 387.0, "epoch": 6.569117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0201761722564697, "kl": 0.008936557569541037, "learning_rate": 8.492627467860285e-07, "loss": 8.956903184298426e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 422.75, "completions/min_length": 368.0, "epoch": 6.570588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015275726094841957, "kl": 0.008487282088026404, "learning_rate": 8.49170902067422e-07, "loss": 8.558800618629903e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 438.875, "completions/min_length": 383.0, "epoch": 6.572058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0068038702011108, "kl": 0.011975576635450125, "learning_rate": 8.490790343463202e-07, "loss": 0.00011840462684631348, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 438.9375, "completions/min_length": 391.0, "epoch": 6.573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014503523707389832, "kl": 0.009078183211386204, "learning_rate": 8.489871436287751e-07, "loss": 9.135359869105741e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 508.625, "completions/min_length": 416.0, "epoch": 6.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.8279150724411011, "kl": 0.008397468482144177, "learning_rate": 8.488952299208401e-07, "loss": 8.433334005530924e-05, "reward": 0.893750011920929, "reward_std": 0.06557891517877579, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.1666666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 414.4375, "completions/min_length": 377.0, "epoch": 6.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.681715726852417, "kl": 0.011755096726119518, "learning_rate": 8.488032932285703e-07, "loss": 0.00011823336535599083, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 515.5, "completions/min_length": 425.0, "epoch": 6.577941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0344206094741821, "kl": 0.010307156015187502, "learning_rate": 8.487113335580222e-07, "loss": 0.00010399016900919378, "reward": 0.4749999940395355, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.34375, "rewards/DrugCombAccuracyCOTORM/std": 0.3966001570224762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 414.875, "completions/min_length": 382.0, "epoch": 6.579411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015923932194709778, "kl": 0.008957525016739964, "learning_rate": 8.486193509152539e-07, "loss": 9.006142499856651e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 474.625, "completions/min_length": 414.0, "epoch": 6.580882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1158714294433594, "kl": 0.01393562275916338, "learning_rate": 8.485273453063251e-07, "loss": 0.00014087285671848804, "reward": 0.753125011920929, "reward_std": 0.21961988508701324, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.7846177816390991, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 425.125, "completions/min_length": 379.0, "epoch": 6.58235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.021649476140737534, "kl": 0.00683483318425715, "learning_rate": 8.484353167372968e-07, "loss": 6.850843783468008e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 436.1875, "completions/min_length": 388.0, "epoch": 6.583823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 653.6204223632812, "kl": 6.383777350420132, "learning_rate": 8.483432652142315e-07, "loss": 0.06597132235765457, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 475.125, "completions/min_length": 359.0, "epoch": 6.5852941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.9525286555290222, "kl": 0.009723563911393285, "learning_rate": 8.482511907431936e-07, "loss": 9.70698893070221e-05, "reward": 0.6830500364303589, "reward_std": 0.07751783728599548, "rewards/DrugCombAccuracyCOTORM/mean": 0.622041642665863, "rewards/DrugCombAccuracyCOTORM/std": 0.4100590944290161, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17612075805664062, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 460.625, "completions/min_length": 392.0, "epoch": 6.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2414990663528442, "kl": 0.0078184426529333, "learning_rate": 8.481590933302484e-07, "loss": 7.810123497620225e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 439.5, "completions/min_length": 395.0, "epoch": 6.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01483377069234848, "kl": 0.00847098685335368, "learning_rate": 8.480669729814633e-07, "loss": 8.494561916450039e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 452.3125, "completions/min_length": 377.0, "epoch": 6.589705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008090727962553501, "kl": 0.006953521980904043, "learning_rate": 8.479748297029068e-07, "loss": 6.954812124604359e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 487.875, "completions/min_length": 405.0, "epoch": 6.591176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9574798941612244, "kl": 0.009304769453592598, "learning_rate": 8.478826635006493e-07, "loss": 9.358787792734802e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 451.625, "completions/min_length": 406.0, "epoch": 6.5926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02238297089934349, "kl": 0.009726950200274587, "learning_rate": 8.477904743807621e-07, "loss": 9.702145325718448e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 447.0, "completions/min_length": 389.0, "epoch": 6.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.016643622890114784, "kl": 0.008143319748342037, "learning_rate": 8.476982623493187e-07, "loss": 8.107785833999515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 439.625, "completions/min_length": 391.0, "epoch": 6.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01688726618885994, "kl": 0.009425551746971905, "learning_rate": 8.476060274123938e-07, "loss": 9.355811926070601e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 452.375, "completions/min_length": 415.0, "epoch": 6.597058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.010634050704538822, "kl": 0.007023723097518086, "learning_rate": 8.475137695760632e-07, "loss": 6.986632070038468e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 488.4375, "completions/min_length": 411.0, "epoch": 6.598529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8980143070220947, "kl": 0.009792958851903677, "learning_rate": 8.474214888464051e-07, "loss": 9.816895180847496e-05, "reward": 0.5249999761581421, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 480.8125, "completions/min_length": 411.0, "epoch": 6.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.0098035335540771, "kl": 0.010612469050101936, "learning_rate": 8.473291852294986e-07, "loss": 0.00010625272989273071, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 471.25, "completions/min_length": 429.0, "epoch": 6.601470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5478068590164185, "kl": 0.01409722026437521, "learning_rate": 8.472368587314243e-07, "loss": 0.00013954192399978638, "reward": 0.44999998807907104, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 452.875, "completions/min_length": 399.0, "epoch": 6.602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0621873140335083, "kl": 0.010386849753558636, "learning_rate": 8.471445093582645e-07, "loss": 0.00010404330532765016, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 432.1875, "completions/min_length": 373.0, "epoch": 6.604411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.016035960987210274, "kl": 0.00880014628637582, "learning_rate": 8.470521371161029e-07, "loss": 8.872802573023364e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 472.75, "completions/min_length": 395.0, "epoch": 6.605882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9324538111686707, "kl": 0.010537673719227314, "learning_rate": 8.469597420110248e-07, "loss": 0.0001050798746291548, "reward": 0.893750011920929, "reward_std": 0.06557891517877579, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.1666666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 505.6875, "completions/min_length": 374.0, "epoch": 6.607352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.017944348976016045, "kl": 0.006944479770027101, "learning_rate": 8.468673240491168e-07, "loss": 7.077043846948072e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 475.625, "completions/min_length": 441.0, "epoch": 6.608823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1396046876907349, "kl": 0.009651439031586051, "learning_rate": 8.467748832364676e-07, "loss": 9.495319682173431e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 458.875, "completions/min_length": 411.0, "epoch": 6.610294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1768760681152344, "kl": 0.013147904770448804, "learning_rate": 8.466824195791664e-07, "loss": 0.00013241171836853027, "reward": 0.9552083015441895, "reward_std": 0.08368229866027832, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/mean_length": 504.625, "completions/min_length": 355.0, "epoch": 6.6117647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.9715421199798584, "kl": 0.009736022679135203, "learning_rate": 8.465899330833049e-07, "loss": 9.684637188911438e-05, "reward": 0.7171875238418579, "reward_std": 0.23422911763191223, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 455.875, "completions/min_length": 406.0, "epoch": 6.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 2.838017702102661, "kl": 0.02269688341766596, "learning_rate": 8.464974237549758e-07, "loss": 0.0002165212936233729, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 522.375, "completions/min_length": 426.0, "epoch": 6.614705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.7147040367126465, "kl": 0.009238908533006907, "learning_rate": 8.464048916002731e-07, "loss": 9.309500455856323e-05, "reward": 0.49863335490226746, "reward_std": 0.06755366921424866, "rewards/DrugCombAccuracyCOTORM/mean": 0.40454167127609253, "rewards/DrugCombAccuracyCOTORM/std": 0.31157562136650085, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.08606628328561783, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 406.3125, "completions/min_length": 337.0, "epoch": 6.616176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012791348621249199, "kl": 0.008255573571659625, "learning_rate": 8.463123366252929e-07, "loss": 8.274029096355662e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 496.4375, "completions/min_length": 414.0, "epoch": 6.617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9567224979400635, "kl": 0.0072919526137411594, "learning_rate": 8.462197588361323e-07, "loss": 7.21365213394165e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 437.0, "completions/min_length": 381.0, "epoch": 6.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9659719467163086, "kl": 0.007382527808658779, "learning_rate": 8.461271582388902e-07, "loss": 7.413227285724133e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 416.1875, "completions/min_length": 334.0, "epoch": 6.620588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1438164710998535, "kl": 0.010607018950395286, "learning_rate": 8.460345348396667e-07, "loss": 0.00010582702816464007, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 439.3125, "completions/min_length": 370.0, "epoch": 6.622058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.6253107786178589, "kl": 0.009691941435448825, "learning_rate": 8.459418886445639e-07, "loss": 9.658187627792358e-05, "reward": 0.8479166626930237, "reward_std": 0.3324779272079468, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 515.0625, "completions/min_length": 454.0, "epoch": 6.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.4211772680282593, "kl": 0.014713007723912597, "learning_rate": 8.458492196596851e-07, "loss": 0.00014457851648330688, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 410.3125, "completions/min_length": 367.0, "epoch": 6.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.009347088634967804, "kl": 0.005947490804828703, "learning_rate": 8.457565278911347e-07, "loss": 5.919732211623341e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 441.5, "completions/min_length": 399.0, "epoch": 6.626470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.020615151152014732, "kl": 0.00835713988635689, "learning_rate": 8.456638133450193e-07, "loss": 8.313407306559384e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 458.0, "completions/min_length": 429.0, "epoch": 6.627941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2666287422180176, "kl": 0.009836334036663175, "learning_rate": 8.455710760274466e-07, "loss": 9.907782077789307e-05, "reward": 0.5125000476837158, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/mean_length": 599.0, "completions/min_length": 486.0, "epoch": 6.629411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8585752248764038, "kl": 0.007437282591126859, "learning_rate": 8.45478315944526e-07, "loss": 7.471870048902929e-05, "reward": 0.5784722566604614, "reward_std": 0.032959211617708206, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7847222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.5022070407867432, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 479.6875, "completions/min_length": 422.0, "epoch": 6.6308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01678472012281418, "kl": 0.011277594836428761, "learning_rate": 8.453855331023683e-07, "loss": 0.00011368829291313887, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 421.0, "completions/min_length": 374.0, "epoch": 6.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01566898077726364, "kl": 0.009227520902641118, "learning_rate": 8.452927275070857e-07, "loss": 9.18938749236986e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 473.6875, "completions/min_length": 441.0, "epoch": 6.633823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7708247900009155, "kl": 0.008921397500671446, "learning_rate": 8.45199899164792e-07, "loss": 8.88928771018982e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 439.6875, "completions/min_length": 362.0, "epoch": 6.635294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1192283630371094, "kl": 0.008482349920086563, "learning_rate": 8.451070480816026e-07, "loss": 8.56125625432469e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 459.5625, "completions/min_length": 400.0, "epoch": 6.636764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.360958218574524, "kl": 0.011015272932127118, "learning_rate": 8.450141742636342e-07, "loss": 0.0001099705696105957, "reward": 0.8422499895095825, "reward_std": 0.3041592538356781, "rewards/DrugCombAccuracyCOTORM/mean": 0.8054167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.37538498640060425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 541.625, "completions/min_length": 468.0, "epoch": 6.6382352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.8139605522155762, "kl": 0.00859214155934751, "learning_rate": 8.449212777170052e-07, "loss": 8.588238415541127e-05, "reward": 0.484375, "reward_std": 0.21409589052200317, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 473.5, "completions/min_length": 422.0, "epoch": 6.639705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0380504131317139, "kl": 0.010837842943146825, "learning_rate": 8.448283584478352e-07, "loss": 0.00010813245899043977, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 452.9375, "completions/min_length": 403.0, "epoch": 6.641176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.5713065266609192, "kl": 0.00858837435953319, "learning_rate": 8.447354164622455e-07, "loss": 8.568167686462402e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 492.25, "completions/min_length": 415.0, "epoch": 6.642647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.942948579788208, "kl": 0.009068212937563658, "learning_rate": 8.446424517663591e-07, "loss": 9.118326124735177e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 499.5625, "completions/min_length": 417.0, "epoch": 6.644117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.3309965133666992, "kl": 0.010284762131050229, "learning_rate": 8.445494643663001e-07, "loss": 0.00010196864604949951, "reward": 0.7427083253860474, "reward_std": 0.3896467089653015, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.8003905415534973, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 461.5, "completions/min_length": 425.0, "epoch": 6.645588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.011388405226171017, "kl": 0.006412268383428454, "learning_rate": 8.444564542681945e-07, "loss": 6.449102511396632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 464.9375, "completions/min_length": 380.0, "epoch": 6.647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1402653455734253, "kl": 0.010551163461059332, "learning_rate": 8.443634214781693e-07, "loss": 0.00010489375563338399, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 425.0625, "completions/min_length": 347.0, "epoch": 6.648529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7294374704360962, "kl": 0.00841597851831466, "learning_rate": 8.442703660023534e-07, "loss": 8.478760719299316e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 416.9375, "completions/min_length": 376.0, "epoch": 6.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.015490420162677765, "kl": 0.008628611802123487, "learning_rate": 8.441772878468769e-07, "loss": 8.629474905319512e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 479.75, "completions/min_length": 412.0, "epoch": 6.651470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8167789578437805, "kl": 0.008522157557308674, "learning_rate": 8.440841870178719e-07, "loss": 8.577903645345941e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 458.1875, "completions/min_length": 394.0, "epoch": 6.652941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1261851787567139, "kl": 0.009843466337770224, "learning_rate": 8.439910635214712e-07, "loss": 9.977351874113083e-05, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 460.1875, "completions/min_length": 412.0, "epoch": 6.654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 6.779017448425293, "kl": 0.011644725338555872, "learning_rate": 8.438979173638099e-07, "loss": 0.00011579499550862238, "reward": 0.7749999761581421, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 444.8125, "completions/min_length": 398.0, "epoch": 6.655882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010580605827271938, "kl": 0.00771053007338196, "learning_rate": 8.43804748551024e-07, "loss": 7.69118414609693e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 478.9375, "completions/min_length": 378.0, "epoch": 6.6573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9480609893798828, "kl": 0.007629561587236822, "learning_rate": 8.437115570892515e-07, "loss": 7.738812564639375e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.8125, "completions/min_length": 404.0, "epoch": 6.658823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.014990556053817272, "kl": 0.00890675827395171, "learning_rate": 8.436183429846313e-07, "loss": 8.868308941600844e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 426.5625, "completions/min_length": 362.0, "epoch": 6.660294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.917925238609314, "kl": 0.008202378754504025, "learning_rate": 8.435251062433044e-07, "loss": 8.219637675210834e-05, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 440.5, "completions/min_length": 385.0, "epoch": 6.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0975117683410645, "kl": 0.012043338501825929, "learning_rate": 8.434318468714126e-07, "loss": 0.0001207670065923594, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 486.5, "completions/min_length": 388.0, "epoch": 6.663235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01100051961839199, "kl": 0.0062661918345838785, "learning_rate": 8.433385648751001e-07, "loss": 6.23066516709514e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 440.25, "completions/min_length": 373.0, "epoch": 6.6647058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.3261475563049316, "kl": 0.010295516112819314, "learning_rate": 8.432452602605116e-07, "loss": 0.00010291360376868397, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 442.5, "completions/min_length": 375.0, "epoch": 6.666176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014526070095598698, "kl": 0.007941464195027947, "learning_rate": 8.431519330337941e-07, "loss": 7.929959974717349e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 504.6875, "completions/min_length": 451.0, "epoch": 6.66764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.060630440711975, "kl": 0.011336821829900146, "learning_rate": 8.430585832010958e-07, "loss": 0.00011226534843444824, "reward": 0.8188124895095825, "reward_std": 0.20012685656547546, "rewards/DrugCombAccuracyCOTORM/mean": 0.8054167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.3473238945007324, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.54081130027771, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 454.375, "completions/min_length": 399.0, "epoch": 6.669117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.012709387578070164, "kl": 0.006911051692441106, "learning_rate": 8.429652107685661e-07, "loss": 6.936631689313799e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 429.1875, "completions/min_length": 376.0, "epoch": 6.670588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013763343915343285, "kl": 0.009010758833028376, "learning_rate": 8.428718157423562e-07, "loss": 8.95396078703925e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 469.625, "completions/min_length": 412.0, "epoch": 6.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9142847657203674, "kl": 0.01020558294840157, "learning_rate": 8.42778398128619e-07, "loss": 0.00010429772373754531, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 446.1875, "completions/min_length": 347.0, "epoch": 6.673529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.028092587366700172, "kl": 0.010617393301799893, "learning_rate": 8.42684957933508e-07, "loss": 0.00010671160271158442, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 454.9375, "completions/min_length": 406.0, "epoch": 6.675, "frac_reward_zero_std": 0.5, "grad_norm": 0.8924751877784729, "kl": 0.01279271999374032, "learning_rate": 8.425914951631795e-07, "loss": 0.00012635657913051546, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 452.0, "completions/min_length": 406.0, "epoch": 6.676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.106154203414917, "kl": 0.011033920105546713, "learning_rate": 8.424980098237902e-07, "loss": 0.00011117011308670044, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 500.75, "completions/min_length": 412.0, "epoch": 6.677941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.3349226713180542, "kl": 0.012592529528774321, "learning_rate": 8.424045019214988e-07, "loss": 0.00012600421905517578, "reward": 0.30000001192092896, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 433.3125, "completions/min_length": 367.0, "epoch": 6.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.936599850654602, "kl": 0.007925266632810235, "learning_rate": 8.423109714624653e-07, "loss": 7.888476102380082e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 461.875, "completions/min_length": 394.0, "epoch": 6.680882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8177085518836975, "kl": 0.018216540338471532, "learning_rate": 8.422174184528513e-07, "loss": 0.00017964803555514663, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.4375, "completions/min_length": 365.0, "epoch": 6.682352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3285480737686157, "kl": 0.009027277235873044, "learning_rate": 8.421238428988197e-07, "loss": 8.963420987129211e-05, "reward": 0.5125000476837158, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 403.375, "completions/min_length": 335.0, "epoch": 6.6838235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.40662822127342224, "kl": 0.014855729416012764, "learning_rate": 8.420302448065353e-07, "loss": 0.0001459854538552463, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 441.4375, "completions/min_length": 412.0, "epoch": 6.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008487890474498272, "kl": 0.006743980222381651, "learning_rate": 8.41936624182164e-07, "loss": 6.762352859368548e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 438.4375, "completions/min_length": 394.0, "epoch": 6.686764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013187437318265438, "kl": 0.009167923242785037, "learning_rate": 8.41842981031873e-07, "loss": 9.162236528936774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 478.125, "completions/min_length": 416.0, "epoch": 6.688235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1626031398773193, "kl": 0.008787300670519471, "learning_rate": 8.417493153618317e-07, "loss": 8.897483348846436e-05, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 569.5, "completions/min_length": 439.0, "epoch": 6.689705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.0543266534805298, "kl": 0.00677011301741004, "learning_rate": 8.416556271782102e-07, "loss": 6.835535168647766e-05, "reward": 0.5650333762168884, "reward_std": 0.30016425251960754, "rewards/DrugCombAccuracyCOTORM/mean": 0.4828541576862335, "rewards/DrugCombAccuracyCOTORM/std": 0.38222235441207886, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7875000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.21460558474063873, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 423.625, "completions/min_length": 368.0, "epoch": 6.6911764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011666962876915932, "kl": 0.0077866013161838055, "learning_rate": 8.415619164871808e-07, "loss": 7.835679571144283e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 444.0, "completions/min_length": 368.0, "epoch": 6.692647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.010625188238918781, "kl": 0.008782198186963797, "learning_rate": 8.414681832949167e-07, "loss": 8.877479558577761e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 431.625, "completions/min_length": 372.0, "epoch": 6.694117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.018957555294036865, "kl": 0.00756350823212415, "learning_rate": 8.413744276075927e-07, "loss": 7.574456685688347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 513.4375, "completions/min_length": 380.0, "epoch": 6.695588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9310725927352905, "kl": 0.008543347008526325, "learning_rate": 8.412806494313853e-07, "loss": 8.4239240095485e-05, "reward": 0.8500000238418579, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 426.625, "completions/min_length": 352.0, "epoch": 6.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.274402379989624, "kl": 0.008015982806682587, "learning_rate": 8.411868487724725e-07, "loss": 7.999919034773484e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 462.25, "completions/min_length": 385.0, "epoch": 6.698529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8973202109336853, "kl": 0.008420135243795812, "learning_rate": 8.410930256370336e-07, "loss": 8.254608110291883e-05, "reward": 0.6330000162124634, "reward_std": 0.16388672590255737, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.46738100051879883, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.7745966911315918, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 441.75, "completions/min_length": 339.0, "epoch": 6.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.012261532247066498, "kl": 0.00851600372698158, "learning_rate": 8.409991800312492e-07, "loss": 8.518130925949663e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 486.125, "completions/min_length": 430.0, "epoch": 6.701470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8817306756973267, "kl": 0.009572508977726102, "learning_rate": 8.409053119613019e-07, "loss": 9.397239773534238e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 485.5, "completions/min_length": 404.0, "epoch": 6.702941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8602272272109985, "kl": 0.010401361156255007, "learning_rate": 8.408114214333753e-07, "loss": 0.0001046097168000415, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 472.0625, "completions/min_length": 416.0, "epoch": 6.704411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.6218143701553345, "kl": 0.011194254970178008, "learning_rate": 8.407175084536548e-07, "loss": 0.00011245906352996826, "reward": 0.6875, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 513.5625, "completions/min_length": 422.0, "epoch": 6.705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4299644231796265, "kl": 0.011997533729299903, "learning_rate": 8.40623573028327e-07, "loss": 0.00012011826038360596, "reward": 0.5220242738723755, "reward_std": 0.366148442029953, "rewards/DrugCombAccuracyCOTORM/mean": 0.43534284830093384, "rewards/DrugCombAccuracyCOTORM/std": 0.4619502127170563, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.737500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.3774917423725128, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 500.0, "completions/min_length": 407.0, "epoch": 6.70735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01621791534125805, "kl": 0.008482626872137189, "learning_rate": 8.405296151635804e-07, "loss": 8.467212319374084e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 447.0, "completions/min_length": 405.0, "epoch": 6.708823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01214031595736742, "kl": 0.008356387377716601, "learning_rate": 8.404356348656043e-07, "loss": 8.364615496248007e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 473.6875, "completions/min_length": 385.0, "epoch": 6.7102941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 1.765130877494812, "kl": 0.009301255806349218, "learning_rate": 8.403416321405903e-07, "loss": 9.384006261825562e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 424.0, "completions/min_length": 386.0, "epoch": 6.711764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072654117830097675, "kl": 0.005400927271693945, "learning_rate": 8.402476069947307e-07, "loss": 5.426860661827959e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 469.875, "completions/min_length": 395.0, "epoch": 6.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010563589632511139, "kl": 0.00908197381068021, "learning_rate": 8.401535594342199e-07, "loss": 9.07999201444909e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 428.6875, "completions/min_length": 348.0, "epoch": 6.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03007498010993004, "kl": 0.01436188374646008, "learning_rate": 8.400594894652534e-07, "loss": 0.0001438018080079928, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 444.625, "completions/min_length": 371.0, "epoch": 6.716176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9134229421615601, "kl": 0.009521636879071593, "learning_rate": 8.399653970940283e-07, "loss": 9.488314390182495e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 456.125, "completions/min_length": 323.0, "epoch": 6.7176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02050202153623104, "kl": 0.008952143136411905, "learning_rate": 8.398712823267433e-07, "loss": 8.956329838838428e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 398.0625, "completions/min_length": 347.0, "epoch": 6.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8358718156814575, "kl": 0.009321921737864614, "learning_rate": 8.397771451695983e-07, "loss": 9.30898095248267e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 464.5, "completions/min_length": 412.0, "epoch": 6.720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0146164894104004, "kl": 0.010958758648484945, "learning_rate": 8.396829856287948e-07, "loss": 0.00010873004794120789, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 483.8125, "completions/min_length": 426.0, "epoch": 6.722058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.010566425509750843, "kl": 0.00724730605725199, "learning_rate": 8.395888037105358e-07, "loss": 7.227491005323827e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 420.5625, "completions/min_length": 344.0, "epoch": 6.723529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013738488778471947, "kl": 0.009759061736986041, "learning_rate": 8.394945994210258e-07, "loss": 9.685280383564532e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 489.4375, "completions/min_length": 366.0, "epoch": 6.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.8404383063316345, "kl": 0.009006829815916717, "learning_rate": 8.394003727664709e-07, "loss": 9.053259418578818e-05, "reward": 0.7760208249092102, "reward_std": 0.19966939091682434, "rewards/DrugCombAccuracyCOTORM/mean": 0.7213281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.44481632113456726, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 466.6875, "completions/min_length": 388.0, "epoch": 6.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 1.8079981803894043, "kl": 0.04460126277990639, "learning_rate": 8.393061237530782e-07, "loss": 0.0004621601547114551, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 426.0625, "completions/min_length": 355.0, "epoch": 6.727941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2180968523025513, "kl": 0.010901589645072818, "learning_rate": 8.392118523870568e-07, "loss": 0.00010911285789916292, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 440.9375, "completions/min_length": 376.0, "epoch": 6.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012011689133942127, "kl": 0.0073821041733026505, "learning_rate": 8.39117558674617e-07, "loss": 7.318369171116501e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 495.875, "completions/min_length": 430.0, "epoch": 6.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8525574207305908, "kl": 0.006501132505945861, "learning_rate": 8.390232426219705e-07, "loss": 6.501920142909512e-05, "reward": 0.9147999882698059, "reward_std": 0.15775975584983826, "rewards/DrugCombAccuracyCOTORM/mean": 0.8997499942779541, "rewards/DrugCombAccuracyCOTORM/std": 0.27393513917922974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.949999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.1366260051727295, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 453.375, "completions/min_length": 372.0, "epoch": 6.732352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 1.4128756523132324, "kl": 0.02777019946370274, "learning_rate": 8.389289042353309e-07, "loss": 0.0002811866579577327, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 455.8125, "completions/min_length": 379.0, "epoch": 6.733823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0615512132644653, "kl": 0.011275439988821745, "learning_rate": 8.388345435209127e-07, "loss": 0.00011418014764785767, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 460.5, "completions/min_length": 409.0, "epoch": 6.735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009370753541588783, "kl": 0.006613850477151573, "learning_rate": 8.387401604849325e-07, "loss": 6.61883459542878e-05, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 471.3125, "completions/min_length": 394.0, "epoch": 6.7367647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.0555574893951416, "kl": 0.010858192690648139, "learning_rate": 8.386457551336074e-07, "loss": 0.00010859304165933281, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 403.375, "completions/min_length": 362.0, "epoch": 6.738235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.09841052442789078, "kl": 0.011041710153222084, "learning_rate": 8.385513274731573e-07, "loss": 0.00011266162618994713, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 393.6875, "completions/min_length": 343.0, "epoch": 6.739705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.010772369801998138, "kl": 0.007752614270430058, "learning_rate": 8.384568775098024e-07, "loss": 7.828653906472027e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 475.5, "completions/min_length": 423.0, "epoch": 6.741176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7953966856002808, "kl": 0.010684598004445434, "learning_rate": 8.383624052497649e-07, "loss": 0.00010770559310913086, "reward": 0.7346875071525574, "reward_std": 0.12401332706212997, "rewards/DrugCombAccuracyCOTORM/mean": 0.6839843988418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3824578523635864, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 444.5, "completions/min_length": 403.0, "epoch": 6.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1398630142211914, "kl": 0.01244259555824101, "learning_rate": 8.382679106992685e-07, "loss": 0.00012469103967305273, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 492.625, "completions/min_length": 430.0, "epoch": 6.7441176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.487287163734436, "kl": 0.010033829137682915, "learning_rate": 8.381733938645381e-07, "loss": 0.00010079145431518555, "reward": 0.5721666812896729, "reward_std": 0.24156293272972107, "rewards/DrugCombAccuracyCOTORM/mean": 0.4860416650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.46809977293014526, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5055249929428101, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 505.375, "completions/min_length": 424.0, "epoch": 6.745588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.5179940462112427, "kl": 0.014415212674066424, "learning_rate": 8.380788547518003e-07, "loss": 0.00014298781752586365, "reward": 0.6087083220481873, "reward_std": 0.24265941977500916, "rewards/DrugCombAccuracyCOTORM/mean": 0.53041672706604, "rewards/DrugCombAccuracyCOTORM/std": 0.31447046995162964, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.18726837635040283, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 448.625, "completions/min_length": 350.0, "epoch": 6.747058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.144775390625, "kl": 0.010518546914681792, "learning_rate": 8.379842933672832e-07, "loss": 0.00010563433170318604, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 515.8125, "completions/min_length": 419.0, "epoch": 6.748529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8382185101509094, "kl": 0.007755719474516809, "learning_rate": 8.378897097172161e-07, "loss": 7.733702659606934e-05, "reward": 0.7369047999382019, "reward_std": 0.18535201251506805, "rewards/DrugCombAccuracyCOTORM/mean": 0.6919642686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.4315127432346344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 473.8125, "completions/min_length": 437.0, "epoch": 6.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.023837801069021225, "kl": 0.008815614157356322, "learning_rate": 8.377951038078301e-07, "loss": 8.74817997100763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 491.3125, "completions/min_length": 408.0, "epoch": 6.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.010402798652649, "kl": 0.012280905619263649, "learning_rate": 8.377004756453575e-07, "loss": 0.0001208186149597168, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 425.5, "completions/min_length": 348.0, "epoch": 6.752941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.018562868237495422, "kl": 0.009345171973109245, "learning_rate": 8.376058252360321e-07, "loss": 9.292612958233804e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 452.8125, "completions/min_length": 385.0, "epoch": 6.754411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013996385969221592, "kl": 0.00800732709467411, "learning_rate": 8.375111525860894e-07, "loss": 7.996321801329032e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 442.9375, "completions/min_length": 392.0, "epoch": 6.7558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.021651295945048332, "kl": 0.007966784411109984, "learning_rate": 8.374164577017662e-07, "loss": 7.981211820151657e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 478.125, "completions/min_length": 444.0, "epoch": 6.757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02981512062251568, "kl": 0.009345209808088839, "learning_rate": 8.373217405893005e-07, "loss": 9.364167635794729e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 486.5625, "completions/min_length": 415.0, "epoch": 6.758823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1835678815841675, "kl": 0.010327831376343966, "learning_rate": 8.37227001254932e-07, "loss": 0.00010291180660715327, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 483.5625, "completions/min_length": 414.0, "epoch": 6.760294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.160927653312683, "kl": 0.010035126120783389, "learning_rate": 8.371322397049025e-07, "loss": 9.993786807172e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 499.3125, "completions/min_length": 398.0, "epoch": 6.761764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2714139223098755, "kl": 0.01047632796689868, "learning_rate": 8.37037455945454e-07, "loss": 0.00010529905557632446, "reward": 0.6738749742507935, "reward_std": 0.28305748105049133, "rewards/DrugCombAccuracyCOTORM/mean": 0.6212500333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4133743345737457, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7687499523162842, "rewards/DrugCombCoverageCOTORM/std": 0.49448344111442566, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 489.5, "completions/min_length": 452.0, "epoch": 6.7632352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.01169954240322113, "kl": 0.008040537126362324, "learning_rate": 8.369426499828309e-07, "loss": 8.020142558962107e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 445.25, "completions/min_length": 395.0, "epoch": 6.764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009781371802091599, "kl": 0.006213756394572556, "learning_rate": 8.368478218232787e-07, "loss": 6.203041266417131e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 461.0625, "completions/min_length": 367.0, "epoch": 6.766176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.080717921257019, "kl": 0.00709256948903203, "learning_rate": 8.367529714730443e-07, "loss": 7.143186667235568e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 436.9375, "completions/min_length": 410.0, "epoch": 6.767647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.021189754828810692, "kl": 0.00803626875858754, "learning_rate": 8.366580989383765e-07, "loss": 7.973860192578286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 494.25, "completions/min_length": 420.0, "epoch": 6.769117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8872703909873962, "kl": 0.01160755311138928, "learning_rate": 8.365632042255251e-07, "loss": 0.0001170913310488686, "reward": 0.8999999761581421, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 402.375, "completions/min_length": 357.0, "epoch": 6.770588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075642429292202, "kl": 0.0051716623129323125, "learning_rate": 8.364682873407415e-07, "loss": 5.157282066647895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 485.125, "completions/min_length": 389.0, "epoch": 6.772058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9859574437141418, "kl": 0.011132950894534588, "learning_rate": 8.363733482902787e-07, "loss": 0.00011227279901504517, "reward": 0.3071666657924652, "reward_std": 0.09037980437278748, "rewards/DrugCombAccuracyCOTORM/mean": 0.1756249964237213, "rewards/DrugCombAccuracyCOTORM/std": 0.1554764211177826, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.06085805967450142, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 441.75, "completions/min_length": 391.0, "epoch": 6.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.024209627881646156, "kl": 0.010923116118647158, "learning_rate": 8.362783870803909e-07, "loss": 0.00010966917034238577, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 453.625, "completions/min_length": 423.0, "epoch": 6.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.01239908766001463, "kl": 0.007754723890684545, "learning_rate": 8.36183403717334e-07, "loss": 7.749705400783569e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 441.125, "completions/min_length": 378.0, "epoch": 6.776470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.029510656371712685, "kl": 0.011960362317040563, "learning_rate": 8.360883982073652e-07, "loss": 0.00012020742724416777, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 448.9375, "completions/min_length": 385.0, "epoch": 6.777941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1008715629577637, "kl": 0.008570775738917291, "learning_rate": 8.359933705567433e-07, "loss": 8.65831971168518e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 461.1875, "completions/min_length": 401.0, "epoch": 6.779411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4417338371276855, "kl": 0.011315579409711063, "learning_rate": 8.358983207717285e-07, "loss": 0.00011312216520309448, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 461.25, "completions/min_length": 411.0, "epoch": 6.780882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.8679158687591553, "kl": 0.014270241605117917, "learning_rate": 8.358032488585822e-07, "loss": 0.00014246255159378052, "reward": 0.8812500238418579, "reward_std": 0.3358757197856903, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 505.0, "completions/min_length": 415.0, "epoch": 6.7823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.020679691806435585, "kl": 0.008670606999658048, "learning_rate": 8.35708154823568e-07, "loss": 8.681378676556051e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 471.0, "completions/min_length": 404.0, "epoch": 6.783823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2930506467819214, "kl": 0.007806923124007881, "learning_rate": 8.356130386729498e-07, "loss": 7.853284478187561e-05, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 442.5, "completions/min_length": 365.0, "epoch": 6.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8141291737556458, "kl": 0.008540131850168109, "learning_rate": 8.355179004129941e-07, "loss": 8.46758484840393e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 473.75, "completions/min_length": 425.0, "epoch": 6.786764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.405583143234253, "kl": 0.01186233595944941, "learning_rate": 8.354227400499683e-07, "loss": 0.00011947005987167358, "reward": 0.31168332695961, "reward_std": 0.24326035380363464, "rewards/DrugCombAccuracyCOTORM/mean": 0.24324999749660492, "rewards/DrugCombAccuracyCOTORM/std": 0.3117579221725464, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.17083333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8265838027000427, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 482.3125, "completions/min_length": 447.0, "epoch": 6.788235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008865396492183208, "kl": 0.0069119170075282454, "learning_rate": 8.353275575901411e-07, "loss": 6.922292959643528e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 433.875, "completions/min_length": 311.0, "epoch": 6.7897058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.424408197402954, "kl": 0.010396136902272701, "learning_rate": 8.352323530397829e-07, "loss": 0.00010294467210769653, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 482.4375, "completions/min_length": 396.0, "epoch": 6.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.16555655002594, "kl": 0.012946395203471184, "learning_rate": 8.351371264051659e-07, "loss": 0.00013090670108795166, "reward": 0.9310833215713501, "reward_std": 0.1580110341310501, "rewards/DrugCombAccuracyCOTORM/mean": 0.92166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.2537715435028076, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 432.3125, "completions/min_length": 376.0, "epoch": 6.79264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0395815446972847, "kl": 0.008326695999130607, "learning_rate": 8.35041877692563e-07, "loss": 8.221747702918947e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 380.5625, "completions/min_length": 308.0, "epoch": 6.794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0756345987319946, "kl": 0.007636314956471324, "learning_rate": 8.34946606908249e-07, "loss": 7.627556624356657e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 542.4375, "completions/min_length": 447.0, "epoch": 6.795588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.2670975923538208, "kl": 0.009886298095807433, "learning_rate": 8.348513140585003e-07, "loss": 9.88990068435669e-05, "reward": 0.5766249895095825, "reward_std": 0.27215832471847534, "rewards/DrugCombAccuracyCOTORM/mean": 0.49291667342185974, "rewards/DrugCombAccuracyCOTORM/std": 0.49026355147361755, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 475.125, "completions/min_length": 413.0, "epoch": 6.797058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7692631483078003, "kl": 0.007717761502135545, "learning_rate": 8.347559991495945e-07, "loss": 7.80746340751648e-05, "reward": 0.987500011920929, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 431.25, "completions/min_length": 399.0, "epoch": 6.798529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7800746560096741, "kl": 0.008611278492026031, "learning_rate": 8.346606621878106e-07, "loss": 8.616248669568449e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 419.1875, "completions/min_length": 389.0, "epoch": 6.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.0203991886228323, "kl": 0.008216481015551835, "learning_rate": 8.34565303179429e-07, "loss": 8.177482231985778e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 470.875, "completions/min_length": 414.0, "epoch": 6.801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.270984172821045, "kl": 0.012616149615496397, "learning_rate": 8.344699221307321e-07, "loss": 0.00012610480189323425, "reward": 0.8142499923706055, "reward_std": 0.20634259283542633, "rewards/DrugCombAccuracyCOTORM/mean": 0.7912499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.3766496777534485, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 512.25, "completions/min_length": 416.0, "epoch": 6.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9067228436470032, "kl": 0.009652799810282886, "learning_rate": 8.343745190480032e-07, "loss": 9.716561180539429e-05, "reward": 0.6094445586204529, "reward_std": 0.03607141971588135, "rewards/DrugCombAccuracyCOTORM/mean": 0.5407119393348694, "rewards/DrugCombAccuracyCOTORM/std": 0.47817933559417725, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.768750011920929, "rewards/DrugCombCoverageCOTORM/std": 0.2414366751909256, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 460.1875, "completions/min_length": 359.0, "epoch": 6.804411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.048112392425537, "kl": 0.006582818925380707, "learning_rate": 8.342790939375271e-07, "loss": 6.495416164398193e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 420.25, "completions/min_length": 368.0, "epoch": 6.805882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660754717886448, "kl": 0.007980664959177375, "learning_rate": 8.341836468055902e-07, "loss": 7.930037099868059e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 428.125, "completions/min_length": 354.0, "epoch": 6.807352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02038406766951084, "kl": 0.008007041993550956, "learning_rate": 8.340881776584805e-07, "loss": 8.040251850616187e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 410.625, "completions/min_length": 366.0, "epoch": 6.8088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.35389024019241333, "kl": 0.018872743123210967, "learning_rate": 8.33992686502487e-07, "loss": 0.00018659851048141718, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 487.4375, "completions/min_length": 415.0, "epoch": 6.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.018778398633003235, "kl": 0.008275022846646607, "learning_rate": 8.338971733439007e-07, "loss": 8.201059245038778e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/mean_length": 579.3125, "completions/min_length": 478.0, "epoch": 6.811764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4156081676483154, "kl": 0.012261252151802182, "learning_rate": 8.338016381890134e-07, "loss": 0.0001226365566253662, "reward": 0.2608124911785126, "reward_std": 0.19235113263130188, "rewards/DrugCombAccuracyCOTORM/mean": 0.1619531214237213, "rewards/DrugCombAccuracyCOTORM/std": 0.2828337550163269, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.7197608351707458, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 473.375, "completions/min_length": 444.0, "epoch": 6.813235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9888238310813904, "kl": 0.007582821883261204, "learning_rate": 8.33706081044119e-07, "loss": 7.613934576511383e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 430.125, "completions/min_length": 375.0, "epoch": 6.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 2.8337295055389404, "kl": 0.009879450430162251, "learning_rate": 8.336105019155125e-07, "loss": 9.927153587341309e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 533.75, "completions/min_length": 456.0, "epoch": 6.8161764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 1.747301697731018, "kl": 0.01211217395029962, "learning_rate": 8.335149008094904e-07, "loss": 0.00012113898992538452, "reward": 0.7520833015441895, "reward_std": 0.3235413432121277, "rewards/DrugCombAccuracyCOTORM/mean": 0.7395833134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.3696281909942627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.690879762172699, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 488.9375, "completions/min_length": 431.0, "epoch": 6.817647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.369619369506836, "kl": 0.008401264203712344, "learning_rate": 8.334192777323507e-07, "loss": 8.411705493927002e-05, "reward": 0.6089166402816772, "reward_std": 0.35489171743392944, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 493.4375, "completions/min_length": 418.0, "epoch": 6.819117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012340589426457882, "kl": 0.008036221377551556, "learning_rate": 8.333236326903927e-07, "loss": 8.090953633654863e-05, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 420.9375, "completions/min_length": 346.0, "epoch": 6.820588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0813848152756691, "kl": 0.009714557440020144, "learning_rate": 8.332279656899173e-07, "loss": 9.687333658803254e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 427.625, "completions/min_length": 364.0, "epoch": 6.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014472251757979393, "kl": 0.008324108202941716, "learning_rate": 8.331322767372269e-07, "loss": 8.413911564275622e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 448.9375, "completions/min_length": 407.0, "epoch": 6.823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01632620021700859, "kl": 0.007809723727405071, "learning_rate": 8.330365658386252e-07, "loss": 7.894624286564067e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 446.75, "completions/min_length": 356.0, "epoch": 6.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.011373089626431465, "kl": 0.00834834878332913, "learning_rate": 8.329408330004171e-07, "loss": 8.362864900846034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 487.3125, "completions/min_length": 434.0, "epoch": 6.826470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6154217720031738, "kl": 0.010042755748145282, "learning_rate": 8.328450782289097e-07, "loss": 0.00010173767805099487, "reward": 0.7926666736602783, "reward_std": 0.35680919885635376, "rewards/DrugCombAccuracyCOTORM/mean": 0.7616666555404663, "rewards/DrugCombAccuracyCOTORM/std": 0.39767101407051086, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 436.9375, "completions/min_length": 356.0, "epoch": 6.827941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0288536548614502, "kl": 0.011215322418138385, "learning_rate": 8.327493015304108e-07, "loss": 0.00011266481305938214, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 457.5, "completions/min_length": 390.0, "epoch": 6.829411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8143852949142456, "kl": 0.01101925119291991, "learning_rate": 8.326535029112301e-07, "loss": 0.00011231750249862671, "reward": 0.5149333477020264, "reward_std": 0.042237844318151474, "rewards/DrugCombAccuracyCOTORM/mean": 0.5082499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.5088768601417542, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0833333432674408, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 437.0625, "completions/min_length": 357.0, "epoch": 6.830882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.013021859340369701, "kl": 0.007937911781482399, "learning_rate": 8.325576823776783e-07, "loss": 7.936484325909987e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 459.4375, "completions/min_length": 413.0, "epoch": 6.83235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.027353839948773384, "kl": 0.007215773337520659, "learning_rate": 8.324618399360681e-07, "loss": 7.196031947387382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 418.6875, "completions/min_length": 369.0, "epoch": 6.833823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7883909940719604, "kl": 0.010981598636135459, "learning_rate": 8.323659755927133e-07, "loss": 0.00011014620395144448, "reward": 0.6603333353996277, "reward_std": 0.03111269697546959, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 452.8125, "completions/min_length": 400.0, "epoch": 6.8352941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.9100296497344971, "kl": 0.00933112483471632, "learning_rate": 8.32270089353929e-07, "loss": 9.293840412283316e-05, "reward": 0.843250036239624, "reward_std": 0.16757279634475708, "rewards/DrugCombAccuracyCOTORM/mean": 0.8118749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.3365282416343689, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.11180340498685837, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 446.875, "completions/min_length": 352.0, "epoch": 6.836764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.11908083409070969, "kl": 0.014280958217568696, "learning_rate": 8.321741812260321e-07, "loss": 0.0001392077247146517, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 441.875, "completions/min_length": 396.0, "epoch": 6.838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01167961023747921, "kl": 0.008569297031499445, "learning_rate": 8.320782512153408e-07, "loss": 8.563373557990417e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 497.0, "completions/min_length": 425.0, "epoch": 6.839705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.583132266998291, "kl": 0.011621302575804293, "learning_rate": 8.319822993281748e-07, "loss": 0.00011611729860305786, "reward": 0.625, "reward_std": 0.4475547969341278, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 484.125, "completions/min_length": 395.0, "epoch": 6.841176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.02360445074737072, "kl": 0.008269679150544107, "learning_rate": 8.318863255708549e-07, "loss": 8.304783841595054e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 475.625, "completions/min_length": 343.0, "epoch": 6.8426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8905052542686462, "kl": 0.00990113231819123, "learning_rate": 8.317903299497038e-07, "loss": 9.90256667137146e-05, "reward": 0.6458333134651184, "reward_std": 0.14330288767814636, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 455.4375, "completions/min_length": 370.0, "epoch": 6.844117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.7969822883605957, "kl": 0.00757217186037451, "learning_rate": 8.316943124710456e-07, "loss": 7.605992868775502e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 453.8125, "completions/min_length": 399.0, "epoch": 6.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.4313348531723022, "kl": 0.015930112916976213, "learning_rate": 8.315982731412053e-07, "loss": 0.0001585636055096984, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 486.1875, "completions/min_length": 416.0, "epoch": 6.847058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.816030740737915, "kl": 0.009245070861652493, "learning_rate": 8.315022119665101e-07, "loss": 9.251023584511131e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 447.6875, "completions/min_length": 402.0, "epoch": 6.848529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014102605171501637, "kl": 0.00821211060974747, "learning_rate": 8.314061289532879e-07, "loss": 8.229170634876937e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 452.375, "completions/min_length": 374.0, "epoch": 6.85, "frac_reward_zero_std": 0.5, "grad_norm": 0.8982497453689575, "kl": 0.008881702669896185, "learning_rate": 8.313100241078687e-07, "loss": 8.872878970578313e-05, "reward": 0.952625036239624, "reward_std": 0.06717978417873383, "rewards/DrugCombAccuracyCOTORM/mean": 0.9524999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.10212194174528122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2916666865348816, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 481.9375, "completions/min_length": 438.0, "epoch": 6.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.4276448488235474, "kl": 0.011183115420863032, "learning_rate": 8.312138974365836e-07, "loss": 0.00011125167657155544, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 471.1875, "completions/min_length": 377.0, "epoch": 6.852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.030693622305989265, "kl": 0.008195599541068077, "learning_rate": 8.311177489457651e-07, "loss": 8.22060537757352e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 446.4375, "completions/min_length": 378.0, "epoch": 6.854411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.00975972693413496, "kl": 0.00878574070520699, "learning_rate": 8.310215786417473e-07, "loss": 8.816111221676692e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 445.8125, "completions/min_length": 341.0, "epoch": 6.855882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01462638285011053, "kl": 0.008041637833230197, "learning_rate": 8.309253865308657e-07, "loss": 7.959338836371899e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 523.0625, "completions/min_length": 447.0, "epoch": 6.857352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0102089643478394, "kl": 0.01091746729798615, "learning_rate": 8.308291726194568e-07, "loss": 0.00010700095299398527, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 462.75, "completions/min_length": 422.0, "epoch": 6.858823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.153018832206726, "kl": 0.013412985252216458, "learning_rate": 8.307329369138594e-07, "loss": 0.00013354781549423933, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 469.125, "completions/min_length": 403.0, "epoch": 6.860294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.7135790586471558, "kl": 0.010331327328458428, "learning_rate": 8.306366794204132e-07, "loss": 0.00010302655573468655, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 414.6875, "completions/min_length": 359.0, "epoch": 6.8617647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.0389597415924072, "kl": 0.00980017357505858, "learning_rate": 8.305404001454591e-07, "loss": 9.839236736297607e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 459.1875, "completions/min_length": 409.0, "epoch": 6.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9754319190979004, "kl": 0.009458818356506526, "learning_rate": 8.3044409909534e-07, "loss": 9.427964687347412e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 498.25, "completions/min_length": 474.0, "epoch": 6.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.08437000215053558, "kl": 0.011774703278206289, "learning_rate": 8.303477762764e-07, "loss": 0.000119180949695874, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 458.25, "completions/min_length": 412.0, "epoch": 6.866176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012462485581636429, "kl": 0.009525513276457787, "learning_rate": 8.302514316949844e-07, "loss": 9.567652159603313e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 511.4375, "completions/min_length": 382.0, "epoch": 6.867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0390387773513794, "kl": 0.01913271425291896, "learning_rate": 8.301550653574402e-07, "loss": 0.00019499317568261176, "reward": 0.782620370388031, "reward_std": 0.1281885802745819, "rewards/DrugCombAccuracyCOTORM/mean": 0.7492389678955078, "rewards/DrugCombAccuracyCOTORM/std": 0.3410577178001404, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8322916626930237, "rewards/DrugCombCoverageCOTORM/std": 0.23073706030845642, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 427.6875, "completions/min_length": 364.0, "epoch": 6.8691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011523498222231865, "kl": 0.007670934312045574, "learning_rate": 8.30058677270116e-07, "loss": 7.670416380278766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 452.3125, "completions/min_length": 359.0, "epoch": 6.870588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3068361282348633, "kl": 0.011885595740750432, "learning_rate": 8.299622674393614e-07, "loss": 0.00011827051639556885, "reward": 0.643750011920929, "reward_std": 0.4098122715950012, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 470.6875, "completions/min_length": 435.0, "epoch": 6.872058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.968515932559967, "kl": 0.008410386042669415, "learning_rate": 8.298658358715274e-07, "loss": 8.405407425016165e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 538.8125, "completions/min_length": 407.0, "epoch": 6.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2159053087234497, "kl": 0.010083626257255673, "learning_rate": 8.297693825729672e-07, "loss": 9.921938180923462e-05, "reward": 0.7355499863624573, "reward_std": 0.18855692446231842, "rewards/DrugCombAccuracyCOTORM/mean": 0.6980833411216736, "rewards/DrugCombAccuracyCOTORM/std": 0.431328147649765, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.3762325644493103, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 475.625, "completions/min_length": 423.0, "epoch": 6.875, "frac_reward_zero_std": 1.0, "grad_norm": 0.019715163856744766, "kl": 0.008711341302841902, "learning_rate": 8.296729075500343e-07, "loss": 8.688517118571326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 431.5625, "completions/min_length": 379.0, "epoch": 6.876470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.917400598526001, "kl": 0.00930090865585953, "learning_rate": 8.295764108090848e-07, "loss": 9.310245513916016e-05, "reward": 0.4937500059604645, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 526.375, "completions/min_length": 450.0, "epoch": 6.877941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4005701541900635, "kl": 0.012940732762217522, "learning_rate": 8.294798923564754e-07, "loss": 0.00012943893671035767, "reward": 0.5333333611488342, "reward_std": 0.35634833574295044, "rewards/DrugCombAccuracyCOTORM/mean": 0.4166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 432.5, "completions/min_length": 370.0, "epoch": 6.879411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9191834926605225, "kl": 0.008835394983179867, "learning_rate": 8.293833521985644e-07, "loss": 8.87066125869751e-05, "reward": 0.5841624736785889, "reward_std": 0.09218303114175797, "rewards/DrugCombAccuracyCOTORM/mean": 0.5532500147819519, "rewards/DrugCombAccuracyCOTORM/std": 0.46855753660202026, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4156250059604645, "rewards/DrugCombCoverageCOTORM/std": 0.8595966100692749, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 496.0625, "completions/min_length": 438.0, "epoch": 6.8808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.5019084215164185, "kl": 0.010574150132015347, "learning_rate": 8.292867903417118e-07, "loss": 0.00010676681995391846, "reward": 0.6233125329017639, "reward_std": 0.3191484808921814, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.609375, "rewards/DrugCombCoverageCOTORM/std": 0.4913311004638672, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 432.0625, "completions/min_length": 379.0, "epoch": 6.882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.020967738702893257, "kl": 0.008923065732233226, "learning_rate": 8.291902067922789e-07, "loss": 8.930898911785334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 446.4375, "completions/min_length": 395.0, "epoch": 6.883823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9911527633666992, "kl": 0.00740842754021287, "learning_rate": 8.290936015566281e-07, "loss": 7.393211126327515e-05, "reward": 0.5, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 479.75, "completions/min_length": 364.0, "epoch": 6.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9010286331176758, "kl": 0.01055229117628187, "learning_rate": 8.289969746411236e-07, "loss": 0.00010494887828826904, "reward": 0.7468750476837158, "reward_std": 0.145428866147995, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 434.0625, "completions/min_length": 352.0, "epoch": 6.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9655846357345581, "kl": 0.008466891013085842, "learning_rate": 8.28900326052131e-07, "loss": 8.50185751914978e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 455.0625, "completions/min_length": 355.0, "epoch": 6.8882352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.8919477462768555, "kl": 0.011412260122597218, "learning_rate": 8.288036557960175e-07, "loss": 0.0001132203615270555, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 484.0625, "completions/min_length": 419.0, "epoch": 6.889705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0850863456726074, "kl": 0.009424832183867693, "learning_rate": 8.287069638791509e-07, "loss": 9.363415301777422e-05, "reward": 0.872509241104126, "reward_std": 0.08295729011297226, "rewards/DrugCombAccuracyCOTORM/mean": 0.8458448648452759, "rewards/DrugCombAccuracyCOTORM/std": 0.21145817637443542, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 429.3125, "completions/min_length": 368.0, "epoch": 6.891176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9226747751235962, "kl": 0.007901971112005413, "learning_rate": 8.286102503079017e-07, "loss": 7.904189988039434e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 465.3125, "completions/min_length": 407.0, "epoch": 6.892647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013745010830461979, "kl": 0.010218714131042361, "learning_rate": 8.285135150886406e-07, "loss": 0.00010273361840518191, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 448.8125, "completions/min_length": 408.0, "epoch": 6.894117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009018228389322758, "kl": 0.00677061581518501, "learning_rate": 8.284167582277406e-07, "loss": 6.75037590553984e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 520.0625, "completions/min_length": 411.0, "epoch": 6.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1044827699661255, "kl": 0.009625133476220071, "learning_rate": 8.283199797315756e-07, "loss": 9.656098700361326e-05, "reward": 0.8799480199813843, "reward_std": 0.17311137914657593, "rewards/DrugCombAccuracyCOTORM/mean": 0.8662111163139343, "rewards/DrugCombAccuracyCOTORM/std": 0.2802085280418396, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8697916865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4990442395210266, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 505.125, "completions/min_length": 427.0, "epoch": 6.897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8626355528831482, "kl": 0.011057290597818792, "learning_rate": 8.282231796065213e-07, "loss": 0.00011198678112123162, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 413.3125, "completions/min_length": 379.0, "epoch": 6.898529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013570079579949379, "kl": 0.007891234941780567, "learning_rate": 8.281263578589546e-07, "loss": 7.911070133559406e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 473.75, "completions/min_length": 388.0, "epoch": 6.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.010275337845087051, "kl": 0.00616414297837764, "learning_rate": 8.280295144952536e-07, "loss": 6.094002310419455e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 476.125, "completions/min_length": 405.0, "epoch": 6.901470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.995379626750946, "kl": 0.01055365172214806, "learning_rate": 8.279326495217986e-07, "loss": 0.00010661780834197998, "reward": 0.7250000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 419.75, "completions/min_length": 367.0, "epoch": 6.902941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.0577489137649536, "kl": 0.009211562108248472, "learning_rate": 8.278357629449703e-07, "loss": 9.231269359588623e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 432.8125, "completions/min_length": 383.0, "epoch": 6.904411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012304657138884068, "kl": 0.006481268093921244, "learning_rate": 8.277388547711517e-07, "loss": 6.518763984786347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 568.3125, "completions/min_length": 487.0, "epoch": 6.905882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0391689538955688, "kl": 0.008905768976546824, "learning_rate": 8.276419250067268e-07, "loss": 8.928542956709862e-05, "reward": 0.476666659116745, "reward_std": 0.13197723031044006, "rewards/DrugCombAccuracyCOTORM/mean": 0.35624998807907104, "rewards/DrugCombAccuracyCOTORM/std": 0.4242149591445923, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17916128039360046, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 474.375, "completions/min_length": 403.0, "epoch": 6.9073529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.59236478805542, "kl": 0.010249898536130786, "learning_rate": 8.27544973658081e-07, "loss": 0.00010175257921218872, "reward": 0.7749999761581421, "reward_std": 0.2121320217847824, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.3145764470100403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 453.1875, "completions/min_length": 405.0, "epoch": 6.908823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.09832321107387543, "kl": 0.011158668203279376, "learning_rate": 8.274480007316013e-07, "loss": 0.00011175201507285237, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 474.625, "completions/min_length": 386.0, "epoch": 6.910294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.5665498971939087, "kl": 0.011624532286077738, "learning_rate": 8.273510062336758e-07, "loss": 0.0001175999641418457, "reward": 0.512499988079071, "reward_std": 0.46301913261413574, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 414.0625, "completions/min_length": 376.0, "epoch": 6.911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01181691512465477, "kl": 0.0087276017293334, "learning_rate": 8.272539901706946e-07, "loss": 8.705213258508593e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 432.0, "completions/min_length": 389.0, "epoch": 6.913235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016384655609726906, "kl": 0.008876325329765677, "learning_rate": 8.271569525490487e-07, "loss": 8.875961066223681e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 467.5625, "completions/min_length": 388.0, "epoch": 6.9147058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.02163628488779068, "kl": 0.008751813555136323, "learning_rate": 8.270598933751306e-07, "loss": 8.766333485255018e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 528.6875, "completions/min_length": 460.0, "epoch": 6.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010606048628687859, "kl": 0.006446852115914226, "learning_rate": 8.269628126553345e-07, "loss": 6.456469418480992e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 454.375, "completions/min_length": 404.0, "epoch": 6.91764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010239748284220695, "kl": 0.008044072426855564, "learning_rate": 8.268657103960557e-07, "loss": 8.024084672797471e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 443.75, "completions/min_length": 356.0, "epoch": 6.919117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9687026739120483, "kl": 0.010253534652292728, "learning_rate": 8.267685866036911e-07, "loss": 0.00010231941269012168, "reward": 0.9069530963897705, "reward_std": 0.13592664897441864, "rewards/DrugCombAccuracyCOTORM/mean": 0.8846679925918579, "rewards/DrugCombAccuracyCOTORM/std": 0.2584220767021179, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 439.3125, "completions/min_length": 341.0, "epoch": 6.920588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00973331369459629, "kl": 0.005971516482532024, "learning_rate": 8.26671441284639e-07, "loss": 5.964545198366977e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 483.0625, "completions/min_length": 388.0, "epoch": 6.922058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9041439890861511, "kl": 0.010132500668987632, "learning_rate": 8.265742744452988e-07, "loss": 0.00010213170025963336, "reward": 0.6621249914169312, "reward_std": 0.014842626638710499, "rewards/DrugCombAccuracyCOTORM/mean": 0.6016517877578735, "rewards/DrugCombAccuracyCOTORM/std": 0.4118228256702423, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8080357313156128, "rewards/DrugCombCoverageCOTORM/std": 0.20593862235546112, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 438.3125, "completions/min_length": 384.0, "epoch": 6.923529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03022974729537964, "kl": 0.01146138203330338, "learning_rate": 8.264770860920721e-07, "loss": 0.0001146889990195632, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 476.3125, "completions/min_length": 391.0, "epoch": 6.925, "frac_reward_zero_std": 0.5, "grad_norm": 6.642411708831787, "kl": 0.011316313524730504, "learning_rate": 8.263798762313612e-07, "loss": 0.00011299682955723256, "reward": 0.9195833206176758, "reward_std": 0.08579113334417343, "rewards/DrugCombAccuracyCOTORM/mean": 0.9125000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.16278821229934692, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 504.5625, "completions/min_length": 429.0, "epoch": 6.926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0366158485412598, "kl": 0.010190897504799068, "learning_rate": 8.262826448695699e-07, "loss": 0.00010168924927711487, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/mean_length": 548.1875, "completions/min_length": 419.0, "epoch": 6.927941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.07023024559021, "kl": 0.00846673536580056, "learning_rate": 8.261853920131038e-07, "loss": 8.512662316206843e-05, "reward": 0.22416667640209198, "reward_std": 0.12365788966417313, "rewards/DrugCombAccuracyCOTORM/mean": 0.10833333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.23959843814373016, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.4367387592792511, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 450.8125, "completions/min_length": 415.0, "epoch": 6.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1922801733016968, "kl": 0.007452158373780549, "learning_rate": 8.260881176683694e-07, "loss": 7.512603042414412e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 509.4375, "completions/min_length": 432.0, "epoch": 6.930882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7921217679977417, "kl": 0.007830841350369155, "learning_rate": 8.259908218417752e-07, "loss": 7.836605072952807e-05, "reward": 0.9367148876190186, "reward_std": 0.1440882384777069, "rewards/DrugCombAccuracyCOTORM/mean": 0.9224561452865601, "rewards/DrugCombAccuracyCOTORM/std": 0.2532098889350891, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.1875, "completions/min_length": 383.0, "epoch": 6.932352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1580179929733276, "kl": 0.00692136213183403, "learning_rate": 8.258935045397305e-07, "loss": 6.948399095563218e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 455.375, "completions/min_length": 398.0, "epoch": 6.9338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.017192350700497627, "kl": 0.008325083181262016, "learning_rate": 8.257961657686466e-07, "loss": 8.267742668977007e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 420.6875, "completions/min_length": 372.0, "epoch": 6.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01728990115225315, "kl": 0.0097050906624645, "learning_rate": 8.256988055349356e-07, "loss": 9.629398118704557e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 434.5625, "completions/min_length": 400.0, "epoch": 6.936764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016258664429187775, "kl": 0.009354385314509273, "learning_rate": 8.256014238450117e-07, "loss": 9.341477561974898e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 477.75, "completions/min_length": 404.0, "epoch": 6.938235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0522180795669556, "kl": 0.009632240980863571, "learning_rate": 8.255040207052899e-07, "loss": 9.529292583465576e-05, "reward": 0.7580000162124634, "reward_std": 0.18723064661026, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4155234098434448, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3464101552963257, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 482.25, "completions/min_length": 426.0, "epoch": 6.939705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.344728708267212, "kl": 0.010183864273130894, "learning_rate": 8.25406596122187e-07, "loss": 0.00010208785533905029, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 458.6875, "completions/min_length": 403.0, "epoch": 6.9411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749480485916138, "kl": 0.010715681361034513, "learning_rate": 8.25309150102121e-07, "loss": 0.0001074021274689585, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/mean_length": 531.75, "completions/min_length": 437.0, "epoch": 6.942647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0233577489852905, "kl": 0.010751239606179297, "learning_rate": 8.252116826515115e-07, "loss": 0.00010655820369720459, "reward": 0.7917604446411133, "reward_std": 0.17571747303009033, "rewards/DrugCombAccuracyCOTORM/mean": 0.7602083683013916, "rewards/DrugCombAccuracyCOTORM/std": 0.36024728417396545, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8359375, "rewards/DrugCombCoverageCOTORM/std": 0.49311670660972595, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 488.5625, "completions/min_length": 365.0, "epoch": 6.944117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.1963173151016235, "kl": 0.01322001009248197, "learning_rate": 8.251141937767794e-07, "loss": 0.0001317209389526397, "reward": 0.7404999732971191, "reward_std": 0.1601668894290924, "rewards/DrugCombAccuracyCOTORM/mean": 0.6957142949104309, "rewards/DrugCombAccuracyCOTORM/std": 0.4057143032550812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8392857313156128, "rewards/DrugCombCoverageCOTORM/std": 0.2142857015132904, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 451.4375, "completions/min_length": 366.0, "epoch": 6.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8215155601501465, "kl": 0.00913165439851582, "learning_rate": 8.250166834843469e-07, "loss": 9.200721979141235e-05, "reward": 0.5513333678245544, "reward_std": 0.13765011727809906, "rewards/DrugCombAccuracyCOTORM/mean": 0.44437500834465027, "rewards/DrugCombAccuracyCOTORM/std": 0.5067934393882751, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 488.6875, "completions/min_length": 401.0, "epoch": 6.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0130071640014648, "kl": 0.010666482266969979, "learning_rate": 8.249191517806378e-07, "loss": 0.00010648369789123535, "reward": 0.6589624881744385, "reward_std": 0.1694340705871582, "rewards/DrugCombAccuracyCOTORM/mean": 0.6147187352180481, "rewards/DrugCombAccuracyCOTORM/std": 0.4721057116985321, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.671875, "rewards/DrugCombCoverageCOTORM/std": 0.4538607597351074, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 473.8125, "completions/min_length": 382.0, "epoch": 6.948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1104323863983154, "kl": 0.013366849394515157, "learning_rate": 8.248215986720773e-07, "loss": 0.00013545065303333104, "reward": 0.8422499895095825, "reward_std": 0.14144781231880188, "rewards/DrugCombAccuracyCOTORM/mean": 0.8054167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.31059530377388, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 485.8125, "completions/min_length": 368.0, "epoch": 6.95, "frac_reward_zero_std": 0.5, "grad_norm": 0.8069605827331543, "kl": 0.009097969392314553, "learning_rate": 8.247240241650917e-07, "loss": 9.158626198768616e-05, "reward": 0.6238095164299011, "reward_std": 0.1321129947900772, "rewards/DrugCombAccuracyCOTORM/mean": 0.5714285969734192, "rewards/DrugCombAccuracyCOTORM/std": 0.47809144854545593, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.688530445098877, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 444.5625, "completions/min_length": 377.0, "epoch": 6.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03508584573864937, "kl": 0.008968505775555968, "learning_rate": 8.246264282661094e-07, "loss": 8.740508201299235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 463.75, "completions/min_length": 426.0, "epoch": 6.952941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0841569900512695, "kl": 0.012144560227170587, "learning_rate": 8.245288109815595e-07, "loss": 0.00012113267439417541, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 428.6875, "completions/min_length": 341.0, "epoch": 6.954411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010275405831634998, "kl": 0.006663724780082703, "learning_rate": 8.244311723178727e-07, "loss": 6.690635200357065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 483.8125, "completions/min_length": 368.0, "epoch": 6.955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6707017421722412, "kl": 0.01049088453873992, "learning_rate": 8.243335122814814e-07, "loss": 0.00010429542453493923, "reward": 0.9636562466621399, "reward_std": 0.10279566049575806, "rewards/DrugCombAccuracyCOTORM/mean": 0.9555468559265137, "rewards/DrugCombAccuracyCOTORM/std": 0.17781250178813934, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 467.6875, "completions/min_length": 367.0, "epoch": 6.95735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.021917037665843964, "kl": 0.008310601231642067, "learning_rate": 8.242358308788191e-07, "loss": 8.368441922357306e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 497.4375, "completions/min_length": 398.0, "epoch": 6.958823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.6821104884147644, "kl": 0.007665890734642744, "learning_rate": 8.241381281163207e-07, "loss": 7.718543929513544e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 506.25, "completions/min_length": 448.0, "epoch": 6.9602941176470585, "frac_reward_zero_std": 0.0, "grad_norm": 1.2417359352111816, "kl": 0.009062389493919909, "learning_rate": 8.240404040004229e-07, "loss": 9.116902947425842e-05, "reward": 0.6766176819801331, "reward_std": 0.2471892535686493, "rewards/DrugCombAccuracyCOTORM/mean": 0.6197303533554077, "rewards/DrugCombAccuracyCOTORM/std": 0.29372674226760864, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8083333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.15371932089328766, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 433.125, "completions/min_length": 382.0, "epoch": 6.961764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01036224514245987, "kl": 0.007070180610753596, "learning_rate": 8.239426585375632e-07, "loss": 7.052638829918578e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 493.75, "completions/min_length": 420.0, "epoch": 6.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7199112176895142, "kl": 0.006630806368775666, "learning_rate": 8.238448917341809e-07, "loss": 6.633996963500977e-05, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 488.625, "completions/min_length": 439.0, "epoch": 6.964705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.007233938667923212, "kl": 0.006045478396117687, "learning_rate": 8.237471035967167e-07, "loss": 6.0223988839425147e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 461.1875, "completions/min_length": 387.0, "epoch": 6.966176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.8850656747817993, "kl": 0.052633923245593905, "learning_rate": 8.236492941316126e-07, "loss": 0.0005066320300102234, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 434.6875, "completions/min_length": 396.0, "epoch": 6.9676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640876978635788, "kl": 0.012180018937215209, "learning_rate": 8.235514633453122e-07, "loss": 0.00012068611977156252, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 515.75, "completions/min_length": 466.0, "epoch": 6.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8155125975608826, "kl": 0.007248814683407545, "learning_rate": 8.234536112442602e-07, "loss": 7.23525881767273e-05, "reward": 0.8194944858551025, "reward_std": 0.09839566797018051, "rewards/DrugCombAccuracyCOTORM/mean": 0.8047499656677246, "rewards/DrugCombAccuracyCOTORM/std": 0.25547537207603455, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7569444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.27582648396492004, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 487.75, "completions/min_length": 399.0, "epoch": 6.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009603649377822876, "kl": 0.005936529720202088, "learning_rate": 8.233557378349028e-07, "loss": 5.9262187278363854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 494.5, "completions/min_length": 436.0, "epoch": 6.972058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9716468453407288, "kl": 0.009647906175814569, "learning_rate": 8.232578431236877e-07, "loss": 9.647756814956665e-05, "reward": 0.6896666884422302, "reward_std": 0.12584204971790314, "rewards/DrugCombAccuracyCOTORM/mean": 0.6381250023841858, "rewards/DrugCombAccuracyCOTORM/std": 0.4239884912967682, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24720662832260132, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 466.625, "completions/min_length": 405.0, "epoch": 6.973529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9893016815185547, "kl": 0.0081789237447083, "learning_rate": 8.231599271170638e-07, "loss": 8.250214159488678e-05, "reward": 0.6098958253860474, "reward_std": 0.19695018231868744, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.28125, "rewards/DrugCombCoverageCOTORM/std": 0.9303897023200989, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 511.9375, "completions/min_length": 450.0, "epoch": 6.975, "frac_reward_zero_std": 0.0, "grad_norm": 1.219452977180481, "kl": 0.011468995828181505, "learning_rate": 8.230619898214819e-07, "loss": 0.00011393800377845764, "reward": 0.7614583373069763, "reward_std": 0.30002570152282715, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.35939764976501465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.4069705307483673, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 457.25, "completions/min_length": 405.0, "epoch": 6.976470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013337221927940845, "kl": 0.007738946354947984, "learning_rate": 8.229640312433936e-07, "loss": 7.776731945341453e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 427.25, "completions/min_length": 347.0, "epoch": 6.977941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013937198556959629, "kl": 0.009762129746377468, "learning_rate": 8.228660513892523e-07, "loss": 9.729040903039277e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 469.875, "completions/min_length": 394.0, "epoch": 6.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.692977786064148, "kl": 0.006520010065287352, "learning_rate": 8.227680502655126e-07, "loss": 6.511062383651733e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 506.8125, "completions/min_length": 413.0, "epoch": 6.980882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.657062292098999, "kl": 0.009236907586455345, "learning_rate": 8.226700278786304e-07, "loss": 9.302794933319092e-05, "reward": 0.5589166879653931, "reward_std": 0.4054989516735077, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6202598214149475, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 458.3125, "completions/min_length": 405.0, "epoch": 6.982352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8184431791305542, "kl": 0.010754836956039071, "learning_rate": 8.225719842350636e-07, "loss": 0.0001066066324710846, "reward": 0.7705000042915344, "reward_std": 0.190548837184906, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4172669053077698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 443.6875, "completions/min_length": 409.0, "epoch": 6.983823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010858646593987942, "kl": 0.007093941560015082, "learning_rate": 8.224739193412706e-07, "loss": 7.177556835813448e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 478.0625, "completions/min_length": 400.0, "epoch": 6.985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8933078050613403, "kl": 0.00950977043248713, "learning_rate": 8.22375833203712e-07, "loss": 9.565427899360657e-05, "reward": 0.5975833535194397, "reward_std": 0.07148842513561249, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371693611145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6607375144958496, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 446.75, "completions/min_length": 412.0, "epoch": 6.9867647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.014965343289077282, "kl": 0.008244442171417177, "learning_rate": 8.222777258288494e-07, "loss": 8.218376024160534e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 487.875, "completions/min_length": 429.0, "epoch": 6.988235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6009914875030518, "kl": 0.010481130331754684, "learning_rate": 8.221795972231457e-07, "loss": 0.0001045912504196167, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 430.0625, "completions/min_length": 398.0, "epoch": 6.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9734605550765991, "kl": 0.010883772512897849, "learning_rate": 8.220814473930655e-07, "loss": 0.00010822326294146478, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 442.6875, "completions/min_length": 404.0, "epoch": 6.991176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.02977096475660801, "kl": 0.010102675994858146, "learning_rate": 8.219832763450746e-07, "loss": 0.00010149989975616336, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 394.0, "completions/min_length": 312.0, "epoch": 6.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9983218312263489, "kl": 0.008323040790855885, "learning_rate": 8.218850840856402e-07, "loss": 8.439517114311457e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 440.9375, "completions/min_length": 352.0, "epoch": 6.9941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.021785704419016838, "kl": 0.009516152087599039, "learning_rate": 8.217868706212311e-07, "loss": 9.59565004450269e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 456.5, "completions/min_length": 380.0, "epoch": 6.995588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010929330252110958, "kl": 0.007401781505905092, "learning_rate": 8.216886359583175e-07, "loss": 7.384012133115903e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 446.8125, "completions/min_length": 352.0, "epoch": 6.997058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010903310030698776, "kl": 0.007534246193245053, "learning_rate": 8.215903801033706e-07, "loss": 7.509677379857749e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 442.0625, "completions/min_length": 395.0, "epoch": 6.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.011164085939526558, "kl": 0.006543761235661805, "learning_rate": 8.214921030628632e-07, "loss": 6.540575122926384e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 438.25, "completions/min_length": 396.0, "epoch": 7.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.012466556392610073, "kl": 0.008213549619540572, "learning_rate": 8.213938048432696e-07, "loss": 8.199128933483735e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 425.125, "completions/min_length": 397.0, "epoch": 7.001470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8937835097312927, "kl": 0.006539694033563137, "learning_rate": 8.212954854510656e-07, "loss": 6.522983312606812e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 482.1875, "completions/min_length": 432.0, "epoch": 7.002941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.033538952469825745, "kl": 0.013260731473565102, "learning_rate": 8.211971448927279e-07, "loss": 0.00013454037252813578, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 401.6875, "completions/min_length": 376.0, "epoch": 7.004411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0129858255386353, "kl": 0.006884791539050639, "learning_rate": 8.210987831747355e-07, "loss": 6.909875810379162e-05, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 464.75, "completions/min_length": 416.0, "epoch": 7.0058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0365756750106812, "kl": 0.008486474980600178, "learning_rate": 8.210004003035675e-07, "loss": 8.402462117373943e-05, "reward": 0.8445625305175781, "reward_std": 0.10821432620286942, "rewards/DrugCombAccuracyCOTORM/mean": 0.8193750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.24990136921405792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.18934279680252075, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 479.3125, "completions/min_length": 436.0, "epoch": 7.007352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.2517576217651367, "kl": 0.008153108065016568, "learning_rate": 8.209019962857058e-07, "loss": 8.07829201221466e-05, "reward": 0.737500011920929, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 497.0625, "completions/min_length": 440.0, "epoch": 7.008823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.2577910423278809, "kl": 0.0073892202926799655, "learning_rate": 8.208035711276326e-07, "loss": 7.367879152297974e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.375, "completions/min_length": 407.0, "epoch": 7.010294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8898583650588989, "kl": 0.010893090162426233, "learning_rate": 8.207051248358319e-07, "loss": 0.00010886706877499819, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 459.9375, "completions/min_length": 424.0, "epoch": 7.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8685082197189331, "kl": 0.008551669074222445, "learning_rate": 8.206066574167892e-07, "loss": 8.543208241462708e-05, "reward": 0.699999988079071, "reward_std": 0.2507132589817047, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 487.375, "completions/min_length": 431.0, "epoch": 7.0132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.01361207664012909, "kl": 0.007193409255705774, "learning_rate": 8.205081688769912e-07, "loss": 7.222715066745877e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 471.5625, "completions/min_length": 378.0, "epoch": 7.014705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1085915565490723, "kl": 0.01066971814725548, "learning_rate": 8.204096592229262e-07, "loss": 0.0001058429479598999, "reward": 0.8967083692550659, "reward_std": 0.15700268745422363, "rewards/DrugCombAccuracyCOTORM/mean": 0.87479168176651, "rewards/DrugCombAccuracyCOTORM/std": 0.2892204523086548, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 465.25, "completions/min_length": 436.0, "epoch": 7.016176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9443861842155457, "kl": 0.009296606411226094, "learning_rate": 8.203111284610838e-07, "loss": 9.266445704270154e-05, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 438.625, "completions/min_length": 393.0, "epoch": 7.017647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8877084255218506, "kl": 0.01196647493634373, "learning_rate": 8.202125765979547e-07, "loss": 0.00011883676052093506, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 387.5, "completions/min_length": 349.0, "epoch": 7.019117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012957449071109295, "kl": 0.007054057787172496, "learning_rate": 8.201140036400316e-07, "loss": 7.06696737324819e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 455.625, "completions/min_length": 367.0, "epoch": 7.020588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.08733736723661423, "kl": 0.01219415059313178, "learning_rate": 8.200154095938079e-07, "loss": 0.00011740809713955969, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 522.0625, "completions/min_length": 404.0, "epoch": 7.022058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6821383833885193, "kl": 0.010042293928563595, "learning_rate": 8.199167944657789e-07, "loss": 0.00010129117436008528, "reward": 0.6381944417953491, "reward_std": 0.1466314047574997, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.881944477558136, "rewards/DrugCombCoverageCOTORM/std": 0.20774608850479126, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 493.75, "completions/min_length": 400.0, "epoch": 7.023529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1794577836990356, "kl": 0.015737114124931395, "learning_rate": 8.198181582624411e-07, "loss": 0.00016303402662742883, "reward": 0.5552083253860474, "reward_std": 0.08368229866027832, "rewards/DrugCombAccuracyCOTORM/mean": 0.4479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.48196646571159363, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 477.9375, "completions/min_length": 421.0, "epoch": 7.025, "frac_reward_zero_std": 0.5, "grad_norm": 0.8882942199707031, "kl": 0.009057725430466235, "learning_rate": 8.197195009902923e-07, "loss": 9.053279063664377e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 420.5, "completions/min_length": 340.0, "epoch": 7.026470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014959941618144512, "kl": 0.008161603822372854, "learning_rate": 8.196208226558321e-07, "loss": 8.118762343656272e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 438.625, "completions/min_length": 360.0, "epoch": 7.027941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.7294321060180664, "kl": 0.010249075712636113, "learning_rate": 8.195221232655607e-07, "loss": 9.998679161071777e-05, "reward": 0.8895377516746521, "reward_std": 0.027558917179703712, "rewards/DrugCombAccuracyCOTORM/mean": 0.8710367679595947, "rewards/DrugCombAccuracyCOTORM/std": 0.14014220237731934, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 422.125, "completions/min_length": 371.0, "epoch": 7.029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007775022182613611, "kl": 0.006453151814639568, "learning_rate": 8.194234028259806e-07, "loss": 6.446409679483622e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 451.125, "completions/min_length": 391.0, "epoch": 7.030882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010907663032412529, "kl": 0.0057300240732729435, "learning_rate": 8.193246613435951e-07, "loss": 5.769592826254666e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 464.5625, "completions/min_length": 383.0, "epoch": 7.0323529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.03920901194214821, "kl": 0.010670005111023784, "learning_rate": 8.19225898824909e-07, "loss": 0.00010694652155507356, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 440.625, "completions/min_length": 380.0, "epoch": 7.033823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.008287745527923107, "kl": 0.006485686521045864, "learning_rate": 8.191271152764284e-07, "loss": 6.48937639198266e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 448.0625, "completions/min_length": 375.0, "epoch": 7.035294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0055254697799683, "kl": 0.009997954941354692, "learning_rate": 8.190283107046611e-07, "loss": 9.956583380699158e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 446.6875, "completions/min_length": 376.0, "epoch": 7.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01665220968425274, "kl": 0.007468495750799775, "learning_rate": 8.189294851161163e-07, "loss": 7.490323332604021e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 503.0, "completions/min_length": 441.0, "epoch": 7.038235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1327017545700073, "kl": 0.008846766431815922, "learning_rate": 8.188306385173038e-07, "loss": 8.872896432876587e-05, "reward": 0.6865631937980652, "reward_std": 0.22710853815078735, "rewards/DrugCombAccuracyCOTORM/mean": 0.6238073110580444, "rewards/DrugCombAccuracyCOTORM/std": 0.38897979259490967, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8751736283302307, "rewards/DrugCombCoverageCOTORM/std": 0.14082160592079163, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 477.4375, "completions/min_length": 385.0, "epoch": 7.0397058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.3028117418289185, "kl": 0.012715794495306909, "learning_rate": 8.187317709147359e-07, "loss": 0.00012499094009399414, "reward": 0.7979166507720947, "reward_std": 0.21610504388809204, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 467.0625, "completions/min_length": 375.0, "epoch": 7.041176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.07853393256664276, "kl": 0.010239726165309548, "learning_rate": 8.186328823149255e-07, "loss": 0.00010184159327764064, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 527.5, "completions/min_length": 420.0, "epoch": 7.04264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3183125257492065, "kl": 0.008505270583555102, "learning_rate": 8.185339727243872e-07, "loss": 8.442997932434082e-05, "reward": 0.7725104093551636, "reward_std": 0.29165810346603394, "rewards/DrugCombAccuracyCOTORM/mean": 0.7263802289962769, "rewards/DrugCombAccuracyCOTORM/std": 0.36417731642723083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9140625, "rewards/DrugCombCoverageCOTORM/std": 0.24882537126541138, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 430.875, "completions/min_length": 389.0, "epoch": 7.044117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.02427913248538971, "kl": 0.00891347520519048, "learning_rate": 8.184350421496369e-07, "loss": 8.871534373611212e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 525.5, "completions/min_length": 419.0, "epoch": 7.045588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3632174730300903, "kl": 0.008354677585884929, "learning_rate": 8.183360905971918e-07, "loss": 8.378922939300537e-05, "reward": 0.7757499814033508, "reward_std": 0.3389550447463989, "rewards/DrugCombAccuracyCOTORM/mean": 0.7379167079925537, "rewards/DrugCombAccuracyCOTORM/std": 0.4045058786869049, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 425.875, "completions/min_length": 385.0, "epoch": 7.047058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011176543310284615, "kl": 0.00637620035558939, "learning_rate": 8.182371180735707e-07, "loss": 6.39539648545906e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 414.0, "completions/min_length": 373.0, "epoch": 7.048529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.014694745652377605, "kl": 0.006884330068714917, "learning_rate": 8.181381245852936e-07, "loss": 6.934141129022464e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 499.0625, "completions/min_length": 386.0, "epoch": 7.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.012455782853066921, "kl": 0.009077832801267505, "learning_rate": 8.180391101388819e-07, "loss": 9.208000847138464e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 449.625, "completions/min_length": 373.0, "epoch": 7.051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1889866590499878, "kl": 0.008001696434803307, "learning_rate": 8.179400747408586e-07, "loss": 8.098443504422903e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 418.75, "completions/min_length": 358.0, "epoch": 7.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.018509769812226295, "kl": 0.009043360478244722, "learning_rate": 8.178410183977478e-07, "loss": 9.115948341786861e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 467.1875, "completions/min_length": 376.0, "epoch": 7.054411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.2981690168380737, "kl": 0.008307685609906912, "learning_rate": 8.177419411160749e-07, "loss": 8.351355791091919e-05, "reward": 0.8685833215713501, "reward_std": 0.33438175916671753, "rewards/DrugCombAccuracyCOTORM/mean": 0.85916668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.34125587344169617, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 450.75, "completions/min_length": 401.0, "epoch": 7.055882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013039867393672466, "kl": 0.006712990463711321, "learning_rate": 8.176428429023672e-07, "loss": 6.685891275992617e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 423.5, "completions/min_length": 352.0, "epoch": 7.057352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9283648729324341, "kl": 0.008719255682080984, "learning_rate": 8.175437237631528e-07, "loss": 8.692849951330572e-05, "reward": 0.7961000204086304, "reward_std": 0.1711217761039734, "rewards/DrugCombAccuracyCOTORM/mean": 0.7576249837875366, "rewards/DrugCombAccuracyCOTORM/std": 0.3728858530521393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8999999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.18539246916770935, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 455.25, "completions/min_length": 404.0, "epoch": 7.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.7675697207450867, "kl": 0.011205557500943542, "learning_rate": 8.174445837049614e-07, "loss": 0.00011250117677263916, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 441.75, "completions/min_length": 406.0, "epoch": 7.060294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8782395720481873, "kl": 0.007986202253960073, "learning_rate": 8.173454227343242e-07, "loss": 7.946789264678955e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 444.625, "completions/min_length": 403.0, "epoch": 7.061764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013196890242397785, "kl": 0.008384346612729132, "learning_rate": 8.172462408577738e-07, "loss": 8.355086902156472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 458.8125, "completions/min_length": 405.0, "epoch": 7.063235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009981472045183182, "kl": 0.006387061323039234, "learning_rate": 8.171470380818437e-07, "loss": 6.383971776813269e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 431.1875, "completions/min_length": 370.0, "epoch": 7.064705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013583430089056492, "kl": 0.008534691063687205, "learning_rate": 8.170478144130695e-07, "loss": 8.603043534094468e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 431.6875, "completions/min_length": 357.0, "epoch": 7.0661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.012499943375587463, "kl": 0.007934715365990996, "learning_rate": 8.169485698579875e-07, "loss": 7.909501437097788e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 466.375, "completions/min_length": 405.0, "epoch": 7.067647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.028293974697589874, "kl": 0.009296595235355198, "learning_rate": 8.168493044231359e-07, "loss": 9.35176940402016e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 519.5625, "completions/min_length": 401.0, "epoch": 7.069117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.4008222818374634, "kl": 0.012203363003209233, "learning_rate": 8.16750018115054e-07, "loss": 0.0001219436526298523, "reward": 0.6314166784286499, "reward_std": 0.28299736976623535, "rewards/DrugCombAccuracyCOTORM/mean": 0.5679166316986084, "rewards/DrugCombAccuracyCOTORM/std": 0.45894384384155273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 419.375, "completions/min_length": 368.0, "epoch": 7.070588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015961365774273872, "kl": 0.007197552244178951, "learning_rate": 8.166507109402824e-07, "loss": 7.214889774331823e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 439.125, "completions/min_length": 386.0, "epoch": 7.072058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.845017671585083, "kl": 0.009279634687118232, "learning_rate": 8.165513829053633e-07, "loss": 9.235349716618657e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 465.6875, "completions/min_length": 369.0, "epoch": 7.073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0917409658432007, "kl": 0.02414276171475649, "learning_rate": 8.164520340168403e-07, "loss": 0.00024068355560302734, "reward": 0.7389583587646484, "reward_std": 0.10547676682472229, "rewards/DrugCombAccuracyCOTORM/mean": 0.682812511920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3714519739151001, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 513.9375, "completions/min_length": 428.0, "epoch": 7.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.9887325167655945, "kl": 0.011119002709165215, "learning_rate": 8.163526642812581e-07, "loss": 0.00011307066597510129, "reward": 0.6017500162124634, "reward_std": 0.04301079735159874, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47761037945747375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 428.25, "completions/min_length": 388.0, "epoch": 7.076470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.09369616210460663, "kl": 0.011651055538095534, "learning_rate": 8.162532737051628e-07, "loss": 0.00011736992746591568, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 458.625, "completions/min_length": 387.0, "epoch": 7.077941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9247642159461975, "kl": 0.009675407316535711, "learning_rate": 8.161538622951023e-07, "loss": 9.738653898239136e-05, "reward": 0.893750011920929, "reward_std": 0.18212535977363586, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 432.6875, "completions/min_length": 372.0, "epoch": 7.079411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9397265315055847, "kl": 0.007775094592943788, "learning_rate": 8.160544300576255e-07, "loss": 7.780641317367554e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 444.8125, "completions/min_length": 382.0, "epoch": 7.080882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0174880251288414, "kl": 0.0073416492668911815, "learning_rate": 8.159549769992826e-07, "loss": 7.336575799854472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 434.25, "completions/min_length": 381.0, "epoch": 7.08235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011938855983316898, "kl": 0.0082986606284976, "learning_rate": 8.158555031266254e-07, "loss": 8.290640835184604e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 445.25, "completions/min_length": 389.0, "epoch": 7.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02017734758555889, "kl": 0.009925756487064064, "learning_rate": 8.157560084462071e-07, "loss": 9.924682672135532e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 446.25, "completions/min_length": 367.0, "epoch": 7.0852941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.013215601444244385, "kl": 0.007785871392115951, "learning_rate": 8.15656492964582e-07, "loss": 7.780255691614002e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 504.625, "completions/min_length": 451.0, "epoch": 7.086764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4993553161621094, "kl": 0.01405150513164699, "learning_rate": 8.155569566883059e-07, "loss": 0.00014093518257141113, "reward": 0.6625000238418579, "reward_std": 0.4609731435775757, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 420.75, "completions/min_length": 388.0, "epoch": 7.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014579822309315205, "kl": 0.0083926614606753, "learning_rate": 8.15457399623936e-07, "loss": 8.370437717530876e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 500.4375, "completions/min_length": 382.0, "epoch": 7.089705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 2.316436290740967, "kl": 0.008981859311461449, "learning_rate": 8.15357821778031e-07, "loss": 9.0180998085998e-05, "reward": 0.7373633980751038, "reward_std": 0.11983926594257355, "rewards/DrugCombAccuracyCOTORM/mean": 0.6775636076927185, "rewards/DrugCombAccuracyCOTORM/std": 0.39214566349983215, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 498.1875, "completions/min_length": 435.0, "epoch": 7.091176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.019179172813892365, "kl": 0.009641800541430712, "learning_rate": 8.152582231571509e-07, "loss": 9.574628347763792e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 454.375, "completions/min_length": 367.0, "epoch": 7.0926470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.028314964845776558, "kl": 0.009484049514867365, "learning_rate": 8.151586037678568e-07, "loss": 9.492460230831057e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 529.25, "completions/min_length": 464.0, "epoch": 7.094117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.361861228942871, "kl": 0.008624908165074885, "learning_rate": 8.150589636167113e-07, "loss": 8.624792098999023e-05, "reward": 0.5637041330337524, "reward_std": 0.19715355336666107, "rewards/DrugCombAccuracyCOTORM/mean": 0.4839642643928528, "rewards/DrugCombAccuracyCOTORM/std": 0.4164040684700012, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7653273940086365, "rewards/DrugCombCoverageCOTORM/std": 0.2302032858133316, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 442.8125, "completions/min_length": 399.0, "epoch": 7.095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9943713545799255, "kl": 0.009448755532503128, "learning_rate": 8.149593027102788e-07, "loss": 9.468092321185395e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 472.875, "completions/min_length": 421.0, "epoch": 7.097058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.017938116565346718, "kl": 0.009482994675636292, "learning_rate": 8.148596210551244e-07, "loss": 9.511217649560422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 432.8125, "completions/min_length": 365.0, "epoch": 7.098529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9108498692512512, "kl": 0.008165323291905224, "learning_rate": 8.14759918657815e-07, "loss": 8.19296037661843e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 460.5625, "completions/min_length": 397.0, "epoch": 7.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.012873046100139618, "kl": 0.007840462727472186, "learning_rate": 8.146601955249187e-07, "loss": 7.811858085915446e-05, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 463.25, "completions/min_length": 400.0, "epoch": 7.101470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5433021783828735, "kl": 0.012960396357811987, "learning_rate": 8.145604516630051e-07, "loss": 0.0001331418752670288, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 449.8125, "completions/min_length": 388.0, "epoch": 7.102941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7242279052734375, "kl": 0.008229874423705041, "learning_rate": 8.144606870786448e-07, "loss": 8.255867578554899e-05, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 481.4375, "completions/min_length": 441.0, "epoch": 7.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.2317724227905273, "kl": 0.01170074357651174, "learning_rate": 8.143609017784105e-07, "loss": 0.00011914622155018151, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.5625, "completions/min_length": 349.0, "epoch": 7.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0860989093780518, "kl": 0.008079069899395108, "learning_rate": 8.142610957688754e-07, "loss": 8.069393516052514e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 432.125, "completions/min_length": 365.0, "epoch": 7.107352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.03936529532074928, "kl": 0.007307455758564174, "learning_rate": 8.141612690566146e-07, "loss": 7.385252683889121e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 438.875, "completions/min_length": 382.0, "epoch": 7.108823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1162570714950562, "kl": 0.007147708907723427, "learning_rate": 8.140614216482044e-07, "loss": 7.179379463195801e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 472.9375, "completions/min_length": 422.0, "epoch": 7.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.2133761644363403, "kl": 0.00797192903701216, "learning_rate": 8.139615535502227e-07, "loss": 7.99819827079773e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 460.6875, "completions/min_length": 391.0, "epoch": 7.1117647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.013178733177483082, "kl": 0.007687651086598635, "learning_rate": 8.138616647692482e-07, "loss": 7.733426900813356e-05, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 415.6875, "completions/min_length": 372.0, "epoch": 7.113235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0686748027801514, "kl": 0.02343095652759075, "learning_rate": 8.137617553118617e-07, "loss": 0.00023712217807769775, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 487.1875, "completions/min_length": 413.0, "epoch": 7.114705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9530360102653503, "kl": 0.009339278331026435, "learning_rate": 8.136618251846446e-07, "loss": 9.305402636528015e-05, "reward": 0.9125000238418579, "reward_std": 0.18077215552330017, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 469.875, "completions/min_length": 408.0, "epoch": 7.116176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.049749493598938, "kl": 0.008036382379941642, "learning_rate": 8.135618743941803e-07, "loss": 8.009593875613064e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 432.375, "completions/min_length": 386.0, "epoch": 7.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0360878705978394, "kl": 0.01099511666689068, "learning_rate": 8.134619029470533e-07, "loss": 0.00011007712600985542, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 479.25, "completions/min_length": 415.0, "epoch": 7.1191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.018335459753870964, "kl": 0.009459944441914558, "learning_rate": 8.133619108498492e-07, "loss": 9.461986337555572e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 426.875, "completions/min_length": 344.0, "epoch": 7.120588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011586823500692844, "kl": 0.0071218840312212706, "learning_rate": 8.132618981091557e-07, "loss": 7.109713624231517e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 482.8125, "completions/min_length": 391.0, "epoch": 7.122058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5277156829833984, "kl": 0.010344669688493013, "learning_rate": 8.13161864731561e-07, "loss": 0.00010317564010620117, "reward": 0.8729166984558105, "reward_std": 0.2348230481147766, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 417.25, "completions/min_length": 351.0, "epoch": 7.123529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.724052369594574, "kl": 0.007412039907649159, "learning_rate": 8.130618107236551e-07, "loss": 7.345527410507202e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 438.4375, "completions/min_length": 367.0, "epoch": 7.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.09369821846485138, "kl": 0.009095476474612951, "learning_rate": 8.129617360920296e-07, "loss": 9.115155262406915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 429.25, "completions/min_length": 360.0, "epoch": 7.126470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.08978473395109177, "kl": 0.011413141619414091, "learning_rate": 8.128616408432767e-07, "loss": 0.0001170327013824135, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 451.4375, "completions/min_length": 371.0, "epoch": 7.127941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9686146378517151, "kl": 0.007700321264564991, "learning_rate": 8.127615249839907e-07, "loss": 7.655471563339233e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 480.0625, "completions/min_length": 398.0, "epoch": 7.129411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015779949724674225, "kl": 0.007808444555848837, "learning_rate": 8.12661388520767e-07, "loss": 7.814752461854368e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/mean_length": 574.3125, "completions/min_length": 405.0, "epoch": 7.1308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1921453475952148, "kl": 0.012589956866577268, "learning_rate": 8.125612314602022e-07, "loss": 0.00012987852096557617, "reward": 0.5817708373069763, "reward_std": 0.019408093765378, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.27216553688049316, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 457.0, "completions/min_length": 417.0, "epoch": 7.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.11287829279899597, "kl": 0.012703662388958037, "learning_rate": 8.124610538088946e-07, "loss": 0.0001272410008823499, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.0625, "completions/min_length": 371.0, "epoch": 7.133823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01709332875907421, "kl": 0.009814739809371531, "learning_rate": 8.123608555734434e-07, "loss": 9.8196673206985e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 457.75, "completions/min_length": 389.0, "epoch": 7.135294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.024483654648065567, "kl": 0.00798122095875442, "learning_rate": 8.122606367604496e-07, "loss": 7.963833922985941e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 463.1875, "completions/min_length": 431.0, "epoch": 7.136764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010769804939627647, "kl": 0.006442291662096977, "learning_rate": 8.121603973765152e-07, "loss": 6.481514719780535e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 465.75, "completions/min_length": 399.0, "epoch": 7.1382352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.022583195939660072, "kl": 0.009276413125917315, "learning_rate": 8.120601374282438e-07, "loss": 9.248239803127944e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 432.0, "completions/min_length": 379.0, "epoch": 7.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018071157857775688, "kl": 0.009673272492364049, "learning_rate": 8.119598569222403e-07, "loss": 9.737078653415665e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 474.5625, "completions/min_length": 327.0, "epoch": 7.141176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.786834716796875, "kl": 0.008332489058375359, "learning_rate": 8.118595558651109e-07, "loss": 8.324151713168249e-05, "reward": 0.9637500047683716, "reward_std": 0.05002973601222038, "rewards/DrugCombAccuracyCOTORM/mean": 0.9624999761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.08062257617712021, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 492.3125, "completions/min_length": 394.0, "epoch": 7.142647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.1525945663452148, "kl": 0.009402063791640103, "learning_rate": 8.117592342634632e-07, "loss": 9.47415828704834e-05, "reward": 0.9020833373069763, "reward_std": 0.1683279275894165, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 441.875, "completions/min_length": 401.0, "epoch": 7.144117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7614479064941406, "kl": 0.008142477134242654, "learning_rate": 8.11658892123906e-07, "loss": 8.095055818557739e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 452.375, "completions/min_length": 410.0, "epoch": 7.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0762978792190552, "kl": 0.008054468897171319, "learning_rate": 8.115585294530498e-07, "loss": 8.100535342236981e-05, "reward": 0.5, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 459.9375, "completions/min_length": 413.0, "epoch": 7.147058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010718696750700474, "kl": 0.007165140355937183, "learning_rate": 8.114581462575062e-07, "loss": 7.146326242946088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 470.9375, "completions/min_length": 421.0, "epoch": 7.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9506780505180359, "kl": 0.00970827683340758, "learning_rate": 8.11357742543888e-07, "loss": 9.673088788986206e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 436.625, "completions/min_length": 376.0, "epoch": 7.15, "frac_reward_zero_std": 0.5, "grad_norm": 0.9322237968444824, "kl": 0.008202596101909876, "learning_rate": 8.112573183188098e-07, "loss": 8.253848500316963e-05, "reward": 0.30000001192092896, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 450.625, "completions/min_length": 379.0, "epoch": 7.151470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9479128122329712, "kl": 0.010317971231415868, "learning_rate": 8.111568735888871e-07, "loss": 0.00010360240412410349, "reward": 0.7250000238418579, "reward_std": 0.2299068123102188, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 460.125, "completions/min_length": 393.0, "epoch": 7.152941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.020418286323547363, "kl": 0.011161086382344365, "learning_rate": 8.110564083607369e-07, "loss": 0.00011242330947425216, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 392.8125, "completions/min_length": 353.0, "epoch": 7.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.014460518024861813, "kl": 0.00883986777625978, "learning_rate": 8.109559226409779e-07, "loss": 8.795989560894668e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 463.625, "completions/min_length": 422.0, "epoch": 7.155882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 4.014032363891602, "kl": 0.08114625315647572, "learning_rate": 8.108554164362295e-07, "loss": 0.0008065644651651382, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 494.3125, "completions/min_length": 411.0, "epoch": 7.1573529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9652722477912903, "kl": 0.00842259346973151, "learning_rate": 8.107548897531129e-07, "loss": 8.488446474075317e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 476.25, "completions/min_length": 409.0, "epoch": 7.158823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9370170831680298, "kl": 0.010075493133626878, "learning_rate": 8.106543425982507e-07, "loss": 0.00010231269698124379, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 455.0, "completions/min_length": 369.0, "epoch": 7.160294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9022960662841797, "kl": 0.009227694710716605, "learning_rate": 8.105537749782665e-07, "loss": 9.191036224365234e-05, "reward": 0.5796874761581421, "reward_std": 0.03892385959625244, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 464.375, "completions/min_length": 395.0, "epoch": 7.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0243512392044067, "kl": 0.009985384065657854, "learning_rate": 8.104531868997857e-07, "loss": 9.934419358614832e-05, "reward": 0.7557916641235352, "reward_std": 0.20408160984516144, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 518.0625, "completions/min_length": 438.0, "epoch": 7.163235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9089464545249939, "kl": 0.010109704453498125, "learning_rate": 8.103525783694345e-07, "loss": 0.00010200417455052957, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 499.625, "completions/min_length": 422.0, "epoch": 7.1647058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.2162611484527588, "kl": 0.01067363191395998, "learning_rate": 8.102519493938406e-07, "loss": 0.00010795157868415117, "reward": 0.612500011920929, "reward_std": 0.1642080694437027, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 451.125, "completions/min_length": 401.0, "epoch": 7.166176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.958702027797699, "kl": 0.010990347480401397, "learning_rate": 8.101512999796339e-07, "loss": 0.00011000990343745798, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.9375, "completions/min_length": 397.0, "epoch": 7.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010708460584282875, "kl": 0.007987149525433779, "learning_rate": 8.100506301334439e-07, "loss": 8.044015703490004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 507.3125, "completions/min_length": 418.0, "epoch": 7.169117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.3625545501708984, "kl": 0.016209929599426687, "learning_rate": 8.099499398619034e-07, "loss": 0.0001660375128267333, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 471.125, "completions/min_length": 442.0, "epoch": 7.170588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.011242641136050224, "kl": 0.008212661603465676, "learning_rate": 8.098492291716452e-07, "loss": 8.184670878108591e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 492.0625, "completions/min_length": 427.0, "epoch": 7.172058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3556580543518066, "kl": 0.00758294970728457, "learning_rate": 8.097484980693039e-07, "loss": 7.553398609161377e-05, "reward": 0.9074000120162964, "reward_std": 0.26191234588623047, "rewards/DrugCombAccuracyCOTORM/mean": 0.887374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.3098659813404083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 445.25, "completions/min_length": 399.0, "epoch": 7.173529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2894366979599, "kl": 0.010713339783251286, "learning_rate": 8.096477465615154e-07, "loss": 0.00010770559310913086, "reward": 0.7749999761581421, "reward_std": 0.41661906242370605, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 488.5, "completions/min_length": 342.0, "epoch": 7.175, "frac_reward_zero_std": 0.5, "grad_norm": 0.9796887040138245, "kl": 0.008985368651337922, "learning_rate": 8.095469746549171e-07, "loss": 8.952245116233826e-05, "reward": 0.9085520505905151, "reward_std": 0.0765765830874443, "rewards/DrugCombAccuracyCOTORM/mean": 0.8905729055404663, "rewards/DrugCombAccuracyCOTORM/std": 0.16875270009040833, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.059839196503162384, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 513.0, "completions/min_length": 364.0, "epoch": 7.176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.758033037185669, "kl": 0.0118556113447994, "learning_rate": 8.094461823561472e-07, "loss": 0.00011913478374481201, "reward": 0.8442803025245667, "reward_std": 0.09293168038129807, "rewards/DrugCombAccuracyCOTORM/mean": 0.8079545497894287, "rewards/DrugCombAccuracyCOTORM/std": 0.2537623345851898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05821416527032852, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 462.4375, "completions/min_length": 399.0, "epoch": 7.177941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.22938665747642517, "kl": 0.013713632943108678, "learning_rate": 8.093453696718462e-07, "loss": 0.0001358323497697711, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 547.0, "completions/min_length": 448.0, "epoch": 7.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.6479448080062866, "kl": 0.011170906596817076, "learning_rate": 8.092445366086549e-07, "loss": 0.0001152608310803771, "reward": 0.9804062843322754, "reward_std": 0.05541948601603508, "rewards/DrugCombAccuracyCOTORM/mean": 0.9764843583106995, "rewards/DrugCombAccuracyCOTORM/std": 0.09406250715255737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 395.75, "completions/min_length": 316.0, "epoch": 7.180882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.00993691198527813, "kl": 0.007622828357852995, "learning_rate": 8.091436831732161e-07, "loss": 7.61627234169282e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 454.3125, "completions/min_length": 409.0, "epoch": 7.182352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.017597099766135216, "kl": 0.009166458854451776, "learning_rate": 8.09042809372174e-07, "loss": 9.175467130262405e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 459.5625, "completions/min_length": 352.0, "epoch": 7.1838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0691863298416138, "kl": 0.007599646341986954, "learning_rate": 8.089419152121736e-07, "loss": 7.568127330159768e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 477.75, "completions/min_length": 422.0, "epoch": 7.185294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012662672437727451, "kl": 0.008653607685118914, "learning_rate": 8.088410006998616e-07, "loss": 8.707021333975717e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 437.25, "completions/min_length": 365.0, "epoch": 7.186764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015088039450347424, "kl": 0.010301071684807539, "learning_rate": 8.087400658418861e-07, "loss": 0.0001033990120049566, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/mean_length": 501.4375, "completions/min_length": 402.0, "epoch": 7.188235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0958020687103271, "kl": 0.008622685796581209, "learning_rate": 8.086391106448964e-07, "loss": 8.630752563476562e-05, "reward": 0.6512500047683716, "reward_std": 0.11870916932821274, "rewards/DrugCombAccuracyCOTORM/mean": 0.5979166626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.4571196436882019, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.32702362537384033, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 461.1875, "completions/min_length": 354.0, "epoch": 7.189705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0053842067718506, "kl": 0.007580550387501717, "learning_rate": 8.085381351155433e-07, "loss": 7.530393486376852e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 461.375, "completions/min_length": 401.0, "epoch": 7.1911764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.018591001629829407, "kl": 0.009159037726931274, "learning_rate": 8.084371392604785e-07, "loss": 9.13480034796521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 460.5, "completions/min_length": 376.0, "epoch": 7.192647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.04834093526005745, "kl": 0.009403546224348247, "learning_rate": 8.083361230863556e-07, "loss": 9.423497249372303e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 551.5, "completions/min_length": 439.0, "epoch": 7.194117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8974113464355469, "kl": 0.009228513925336301, "learning_rate": 8.082350865998291e-07, "loss": 9.162643254967406e-05, "reward": 0.6817141771316528, "reward_std": 0.10970587283372879, "rewards/DrugCombAccuracyCOTORM/mean": 0.6223250031471252, "rewards/DrugCombAccuracyCOTORM/std": 0.42359673976898193, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.27752211689949036, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 490.6875, "completions/min_length": 449.0, "epoch": 7.195588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.3855390548706055, "kl": 0.009351163986139, "learning_rate": 8.081340298075552e-07, "loss": 9.407848119735718e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 437.0625, "completions/min_length": 367.0, "epoch": 7.197058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8549314737319946, "kl": 0.007717946427874267, "learning_rate": 8.080329527161913e-07, "loss": 7.788146467646584e-05, "reward": 0.7094500064849854, "reward_std": 0.18264131247997284, "rewards/DrugCombAccuracyCOTORM/mean": 0.6414999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.4820331931114197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.08062257617712021, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 481.0, "completions/min_length": 401.0, "epoch": 7.198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.201601505279541, "kl": 0.010643023299053311, "learning_rate": 8.079318553323959e-07, "loss": 0.00010515638859942555, "reward": 0.8666666746139526, "reward_std": 0.16390569508075714, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.49441322684288025, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 461.5, "completions/min_length": 403.0, "epoch": 7.2, "frac_reward_zero_std": 0.5, "grad_norm": 0.9902651906013489, "kl": 0.008403632324188948, "learning_rate": 8.07830737662829e-07, "loss": 8.495058864355087e-05, "reward": 0.7937500476837158, "reward_std": 0.22109711170196533, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 495.1875, "completions/min_length": 420.0, "epoch": 7.201470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.882932722568512, "kl": 0.008385611115954816, "learning_rate": 8.077295997141525e-07, "loss": 8.41145811136812e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 465.4375, "completions/min_length": 412.0, "epoch": 7.202941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01840563490986824, "kl": 0.008766786428168416, "learning_rate": 8.076284414930286e-07, "loss": 8.626597264083102e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 523.875, "completions/min_length": 471.0, "epoch": 7.204411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.905663788318634, "kl": 0.007767359958961606, "learning_rate": 8.075272630061213e-07, "loss": 7.749348878860474e-05, "reward": 0.8790919184684753, "reward_std": 0.0552716888487339, "rewards/DrugCombAccuracyCOTORM/mean": 0.8648805022239685, "rewards/DrugCombAccuracyCOTORM/std": 0.1646866351366043, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.871874988079071, "rewards/DrugCombCoverageCOTORM/std": 0.1505199372768402, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 429.8125, "completions/min_length": 386.0, "epoch": 7.205882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016043655574321747, "kl": 0.008635009755380452, "learning_rate": 8.074260642600963e-07, "loss": 8.686182263772935e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 479.25, "completions/min_length": 395.0, "epoch": 7.20735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0790172815322876, "kl": 0.008599875029176474, "learning_rate": 8.073248452616201e-07, "loss": 8.517307287547737e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 473.5, "completions/min_length": 395.0, "epoch": 7.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03380923718214035, "kl": 0.010852135019376874, "learning_rate": 8.07223606017361e-07, "loss": 0.00010869625839404762, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 472.25, "completions/min_length": 379.0, "epoch": 7.2102941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193401500582695, "kl": 0.011035825591534376, "learning_rate": 8.07122346533988e-07, "loss": 0.0001098731518140994, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 450.9375, "completions/min_length": 371.0, "epoch": 7.211764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01872437447309494, "kl": 0.009310330613516271, "learning_rate": 8.070210668181722e-07, "loss": 9.27651853999123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 469.5625, "completions/min_length": 393.0, "epoch": 7.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02249421924352646, "kl": 0.011240492109209299, "learning_rate": 8.069197668765855e-07, "loss": 0.0001115881314035505, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 456.0, "completions/min_length": 383.0, "epoch": 7.214705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0156488418579102, "kl": 0.010883571114391088, "learning_rate": 8.068184467159013e-07, "loss": 0.00011040270328521729, "reward": 0.5734999775886536, "reward_std": 0.05458850413560867, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 403.9375, "completions/min_length": 372.0, "epoch": 7.216176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0663743019104004, "kl": 0.008962183026596904, "learning_rate": 8.067171063427942e-07, "loss": 8.90602168510668e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 465.9375, "completions/min_length": 387.0, "epoch": 7.2176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02118518017232418, "kl": 0.009800518862903118, "learning_rate": 8.066157457639405e-07, "loss": 9.702226088847965e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 497.125, "completions/min_length": 360.0, "epoch": 7.219117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.657894492149353, "kl": 0.014323957497254014, "learning_rate": 8.065143649860171e-07, "loss": 0.0001438334584236145, "reward": 0.606249988079071, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 441.375, "completions/min_length": 394.0, "epoch": 7.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9827166795730591, "kl": 0.008591555524617434, "learning_rate": 8.064129640157033e-07, "loss": 8.622556924819946e-05, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 483.75, "completions/min_length": 363.0, "epoch": 7.222058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.252733826637268, "kl": 0.008835701970383525, "learning_rate": 8.063115428596787e-07, "loss": 8.796155452728271e-05, "reward": 0.6881159543991089, "reward_std": 0.21877719461917877, "rewards/DrugCombAccuracyCOTORM/mean": 0.6101449131965637, "rewards/DrugCombAccuracyCOTORM/std": 0.4915757477283478, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 421.875, "completions/min_length": 312.0, "epoch": 7.223529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.237953782081604, "kl": 0.009043537429533899, "learning_rate": 8.06210101524625e-07, "loss": 9.075552225112915e-05, "reward": 0.7257708311080933, "reward_std": 0.1274711787700653, "rewards/DrugCombAccuracyCOTORM/mean": 0.676744818687439, "rewards/DrugCombAccuracyCOTORM/std": 0.39275607466697693, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.21489663422107697, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 563.0, "completions/min_length": 454.0, "epoch": 7.225, "frac_reward_zero_std": 0.5, "grad_norm": 0.7948526740074158, "kl": 0.009144030744209886, "learning_rate": 8.061086400172245e-07, "loss": 9.094923734664917e-05, "reward": 0.16875000298023224, "reward_std": 0.04580627381801605, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 467.75, "completions/min_length": 388.0, "epoch": 7.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008123369887471199, "kl": 0.007223755354061723, "learning_rate": 8.060071583441617e-07, "loss": 7.245408778544515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 437.875, "completions/min_length": 390.0, "epoch": 7.227941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.005937932524830103, "kl": 0.00497209484456107, "learning_rate": 8.059056565121216e-07, "loss": 4.939482460031286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 420.75, "completions/min_length": 360.0, "epoch": 7.229411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013757736422121525, "kl": 0.008276091888546944, "learning_rate": 8.05804134527791e-07, "loss": 8.363217057194561e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 471.8125, "completions/min_length": 396.0, "epoch": 7.230882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.020123304799199104, "kl": 0.010583550203591585, "learning_rate": 8.05702592397858e-07, "loss": 0.00010630708857206628, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 472.1875, "completions/min_length": 408.0, "epoch": 7.232352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7358335852622986, "kl": 0.009815562050789595, "learning_rate": 8.056010301290117e-07, "loss": 9.920174488797784e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 415.625, "completions/min_length": 388.0, "epoch": 7.233823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.407726526260376, "kl": 0.010215124813839793, "learning_rate": 8.054994477279431e-07, "loss": 0.00010225921869277954, "reward": 0.8678333163261414, "reward_std": 0.29356443881988525, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 468.5625, "completions/min_length": 420.0, "epoch": 7.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.136676549911499, "kl": 0.01113667106255889, "learning_rate": 8.053978452013439e-07, "loss": 0.00011174380779266357, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 533.8125, "completions/min_length": 441.0, "epoch": 7.2367647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.8576785326004028, "kl": 0.008935129386372864, "learning_rate": 8.052962225559074e-07, "loss": 8.816880290396512e-05, "reward": 0.7250000238418579, "reward_std": 0.24928468465805054, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 434.5625, "completions/min_length": 393.0, "epoch": 7.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1055030822753906, "kl": 0.013143116375431418, "learning_rate": 8.051945797983286e-07, "loss": 0.00013322383165359497, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 446.75, "completions/min_length": 358.0, "epoch": 7.239705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015099178068339825, "kl": 0.007320894277654588, "learning_rate": 8.050929169353032e-07, "loss": 7.325397746171802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 447.5625, "completions/min_length": 398.0, "epoch": 7.241176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7697491645812988, "kl": 0.008334863698109984, "learning_rate": 8.049912339735283e-07, "loss": 8.340924978256226e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 475.375, "completions/min_length": 422.0, "epoch": 7.242647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.046370021998882294, "kl": 0.011957254959270358, "learning_rate": 8.04889530919703e-07, "loss": 0.00012003265146631747, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 444.375, "completions/min_length": 400.0, "epoch": 7.2441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0097392862662673, "kl": 0.0070348383160308, "learning_rate": 8.047878077805268e-07, "loss": 6.998531171120703e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 515.3125, "completions/min_length": 467.0, "epoch": 7.245588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3197752237319946, "kl": 0.010660632280632854, "learning_rate": 8.046860645627013e-07, "loss": 0.00010658800601959229, "reward": 0.7464166879653931, "reward_std": 0.3457193374633789, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 453.25, "completions/min_length": 400.0, "epoch": 7.247058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1288906335830688, "kl": 0.010464666411280632, "learning_rate": 8.045843012729287e-07, "loss": 0.00010412951814942062, "reward": 0.8427083492279053, "reward_std": 0.16781966388225555, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.30103984475135803, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 441.0625, "completions/min_length": 403.0, "epoch": 7.248529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010637406259775162, "kl": 0.0075259346049278975, "learning_rate": 8.044825179179134e-07, "loss": 7.480074418708682e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 432.375, "completions/min_length": 377.0, "epoch": 7.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.01778401806950569, "kl": 0.007679524947889149, "learning_rate": 8.043807145043603e-07, "loss": 7.661413110326976e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 440.1875, "completions/min_length": 386.0, "epoch": 7.251470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.008494952693581581, "kl": 0.007474535144865513, "learning_rate": 8.04278891038976e-07, "loss": 7.490688585676253e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 456.5625, "completions/min_length": 395.0, "epoch": 7.252941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7895170450210571, "kl": 0.009539823629893363, "learning_rate": 8.041770475284683e-07, "loss": 9.65896833804436e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 428.375, "completions/min_length": 384.0, "epoch": 7.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015370624139904976, "kl": 0.007457411149516702, "learning_rate": 8.040751839795465e-07, "loss": 7.45700453990139e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 429.875, "completions/min_length": 312.0, "epoch": 7.2558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2721302509307861, "kl": 0.008798084454610944, "learning_rate": 8.039733003989213e-07, "loss": 8.7738037109375e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 437.6875, "completions/min_length": 394.0, "epoch": 7.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.013492793776094913, "kl": 0.00783117744140327, "learning_rate": 8.038713967933042e-07, "loss": 7.857558375690132e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 461.3125, "completions/min_length": 423.0, "epoch": 7.258823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02296987734735012, "kl": 0.008070127572864294, "learning_rate": 8.037694731694084e-07, "loss": 8.082696876954287e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 513.5625, "completions/min_length": 446.0, "epoch": 7.260294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1920502185821533, "kl": 0.013913344824686646, "learning_rate": 8.036675295339486e-07, "loss": 0.00013905763626098633, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 409.125, "completions/min_length": 334.0, "epoch": 7.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012014527805149555, "kl": 0.008330525481142104, "learning_rate": 8.035655658936403e-07, "loss": 8.260120375780389e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 447.6875, "completions/min_length": 408.0, "epoch": 7.2632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9916457533836365, "kl": 0.008897965657524765, "learning_rate": 8.034635822552007e-07, "loss": 8.97447025636211e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 453.6875, "completions/min_length": 377.0, "epoch": 7.264705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3737683296203613, "kl": 0.009358056937344372, "learning_rate": 8.033615786253485e-07, "loss": 9.407475590705872e-05, "reward": 0.6034375429153442, "reward_std": 0.4286407232284546, "rewards/DrugCombAccuracyCOTORM/mean": 0.5257812738418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4943242371082306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.3502231538295746, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 528.0, "completions/min_length": 411.0, "epoch": 7.266176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.371835708618164, "kl": 0.010351930162869394, "learning_rate": 8.032595550108031e-07, "loss": 0.00010360777378082275, "reward": 0.734375, "reward_std": 0.42111557722091675, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 440.1875, "completions/min_length": 390.0, "epoch": 7.267647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8549755811691284, "kl": 0.010053366888314486, "learning_rate": 8.031575114182855e-07, "loss": 0.00010018680768553168, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 420.5625, "completions/min_length": 387.0, "epoch": 7.269117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.024824736639857292, "kl": 0.011030119145289063, "learning_rate": 8.030554478545186e-07, "loss": 0.00010998762445524335, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 445.3125, "completions/min_length": 366.0, "epoch": 7.270588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6527775526046753, "kl": 0.014197828248143196, "learning_rate": 8.029533643262255e-07, "loss": 0.00014061108231544495, "reward": 0.5954999923706055, "reward_std": 0.42811667919158936, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47608789801597595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.41231057047843933, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 450.8125, "completions/min_length": 363.0, "epoch": 7.272058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8227199912071228, "kl": 0.00919437501579523, "learning_rate": 8.028512608401314e-07, "loss": 9.355694055557251e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 511.4375, "completions/min_length": 424.0, "epoch": 7.273529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3024412393569946, "kl": 0.008423826657235622, "learning_rate": 8.027491374029629e-07, "loss": 8.487701416015625e-05, "reward": 0.5606470108032227, "reward_std": 0.2619023323059082, "rewards/DrugCombAccuracyCOTORM/mean": 0.5126577615737915, "rewards/DrugCombAccuracyCOTORM/std": 0.3091716468334198, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5052083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4892064034938812, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 427.6875, "completions/min_length": 399.0, "epoch": 7.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.01030771154910326, "kl": 0.0070466771721839905, "learning_rate": 8.02646994021447e-07, "loss": 7.013234426267445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 452.125, "completions/min_length": 359.0, "epoch": 7.276470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7797607779502869, "kl": 0.00815811741631478, "learning_rate": 8.025448307023135e-07, "loss": 8.2358717918396e-05, "reward": 0.30133333802223206, "reward_std": 0.03111269697546959, "rewards/DrugCombAccuracyCOTORM/mean": 0.17875000834465027, "rewards/DrugCombAccuracyCOTORM/std": 0.054999999701976776, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 528.6875, "completions/min_length": 423.0, "epoch": 7.277941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.7729573845863342, "kl": 0.009104703203774989, "learning_rate": 8.024426474522918e-07, "loss": 9.305030107498169e-05, "reward": 0.7719107270240784, "reward_std": 0.07944849878549576, "rewards/DrugCombAccuracyCOTORM/mean": 0.7266071438789368, "rewards/DrugCombAccuracyCOTORM/std": 0.2964767515659332, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.375, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 477.125, "completions/min_length": 413.0, "epoch": 7.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010054211132228374, "kl": 0.007413632003590465, "learning_rate": 8.02340444278114e-07, "loss": 7.379833550658077e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 531.9375, "completions/min_length": 415.0, "epoch": 7.280882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0442196130752563, "kl": 0.009587190230377018, "learning_rate": 8.02238221186513e-07, "loss": 9.543762280372903e-05, "reward": 0.8648750185966492, "reward_std": 0.0854184553027153, "rewards/DrugCombAccuracyCOTORM/mean": 0.8402083516120911, "rewards/DrugCombAccuracyCOTORM/std": 0.21844728291034698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.1717960685491562, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 419.875, "completions/min_length": 340.0, "epoch": 7.2823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.012395905330777168, "kl": 0.007794416509568691, "learning_rate": 8.021359781842228e-07, "loss": 7.788468792568892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 462.9375, "completions/min_length": 391.0, "epoch": 7.283823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03309885412454605, "kl": 0.010371650569140911, "learning_rate": 8.020337152779789e-07, "loss": 0.00010362053581047803, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 455.0, "completions/min_length": 411.0, "epoch": 7.285294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011283891275525093, "kl": 0.008096119505353272, "learning_rate": 8.019314324745182e-07, "loss": 8.10554702184163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 417.1875, "completions/min_length": 367.0, "epoch": 7.286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016861703246831894, "kl": 0.00871128763537854, "learning_rate": 8.018291297805789e-07, "loss": 8.745474042370915e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 488.375, "completions/min_length": 441.0, "epoch": 7.288235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1390328407287598, "kl": 0.007996437605470419, "learning_rate": 8.017268072029002e-07, "loss": 7.958710193634033e-05, "reward": 0.660812497138977, "reward_std": 0.29600298404693604, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 531.125, "completions/min_length": 450.0, "epoch": 7.2897058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.976207435131073, "kl": 0.011272752191871405, "learning_rate": 8.016244647482231e-07, "loss": 0.00011283688218099996, "reward": 0.9513333439826965, "reward_std": 0.13765011727809906, "rewards/DrugCombAccuracyCOTORM/mean": 0.9443750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.2224999964237213, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 525.3125, "completions/min_length": 419.0, "epoch": 7.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.897760808467865, "kl": 0.010675553232431412, "learning_rate": 8.015221024232897e-07, "loss": 0.00010669976472854614, "reward": 0.7589166760444641, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.28052034974098206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 482.375, "completions/min_length": 436.0, "epoch": 7.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016127346083521843, "kl": 0.009944354766048491, "learning_rate": 8.014197202348432e-07, "loss": 9.989924728870392e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 449.0625, "completions/min_length": 416.0, "epoch": 7.294117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.008964497596025467, "kl": 0.007360185030847788, "learning_rate": 8.013173181896282e-07, "loss": 7.324000762309879e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 448.5, "completions/min_length": 374.0, "epoch": 7.295588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1860389709472656, "kl": 0.011454665334895253, "learning_rate": 8.012148962943908e-07, "loss": 0.00011448182340245694, "reward": 0.6273333430290222, "reward_std": 0.04703797399997711, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 451.375, "completions/min_length": 403.0, "epoch": 7.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9538271427154541, "kl": 0.008007814758457243, "learning_rate": 8.011124545558785e-07, "loss": 8.059293031692505e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/mean_length": 519.6875, "completions/min_length": 418.0, "epoch": 7.298529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4787507057189941, "kl": 0.012890542042441666, "learning_rate": 8.010099929808396e-07, "loss": 0.00012819841504096985, "reward": 0.8489000201225281, "reward_std": 0.2444196492433548, "rewards/DrugCombAccuracyCOTORM/mean": 0.8142499923706055, "rewards/DrugCombAccuracyCOTORM/std": 0.2922719120979309, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 494.0, "completions/min_length": 454.0, "epoch": 7.3, "frac_reward_zero_std": 0.5, "grad_norm": 1.263461947441101, "kl": 0.010297772008925676, "learning_rate": 8.009075115760242e-07, "loss": 0.00010330229997634888, "reward": 0.8964166641235352, "reward_std": 0.19718943536281586, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 428.75, "completions/min_length": 373.0, "epoch": 7.301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009431482292711735, "kl": 0.008547529694624245, "learning_rate": 8.008050103481834e-07, "loss": 8.578092092648149e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 462.1875, "completions/min_length": 422.0, "epoch": 7.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2668553590774536, "kl": 0.01005481299944222, "learning_rate": 8.007024893040697e-07, "loss": 0.00010026917152572423, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 492.0, "completions/min_length": 456.0, "epoch": 7.304411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9604610800743103, "kl": 0.014113422366790473, "learning_rate": 8.00599948450437e-07, "loss": 0.00014026110875420272, "reward": 0.6151666641235352, "reward_std": 0.16651105880737305, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7781745791435242, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 420.75, "completions/min_length": 363.0, "epoch": 7.305882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012342782691121101, "kl": 0.00848684785887599, "learning_rate": 8.004973877940404e-07, "loss": 8.48935087560676e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 486.3125, "completions/min_length": 419.0, "epoch": 7.307352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8527756929397583, "kl": 0.010696013574488461, "learning_rate": 8.003948073416364e-07, "loss": 0.00010542571544647217, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 423.6875, "completions/min_length": 346.0, "epoch": 7.3088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.01447239238768816, "kl": 0.008464267011731863, "learning_rate": 8.002922070999827e-07, "loss": 8.498874376527965e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 428.0, "completions/min_length": 385.0, "epoch": 7.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.14190934598445892, "kl": 0.017722228076308966, "learning_rate": 8.001895870758384e-07, "loss": 0.00017485275748185813, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 458.5625, "completions/min_length": 345.0, "epoch": 7.311764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.334912657737732, "kl": 0.009048681240528822, "learning_rate": 8.000869472759637e-07, "loss": 9.055435657501221e-05, "reward": 0.7749999761581421, "reward_std": 0.40316134691238403, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 431.875, "completions/min_length": 357.0, "epoch": 7.313235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011346034705638885, "kl": 0.008713380433619022, "learning_rate": 7.999842877071203e-07, "loss": 8.702222839929163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 537.6875, "completions/min_length": 468.0, "epoch": 7.314705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8576610684394836, "kl": 0.007362705538980663, "learning_rate": 7.998816083760713e-07, "loss": 7.418698078254238e-05, "reward": 0.46145835518836975, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.3958333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.4254627227783203, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4479166567325592, "rewards/DrugCombCoverageCOTORM/std": 0.4663441479206085, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 448.25, "completions/min_length": 392.0, "epoch": 7.3161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.016651004552841187, "kl": 0.0076759441290050745, "learning_rate": 7.997789092895808e-07, "loss": 7.611473847646266e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 597.5625, "completions/min_length": 505.0, "epoch": 7.317647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.421898365020752, "kl": 0.01830387767404318, "learning_rate": 7.996761904544145e-07, "loss": 0.0001914501190185547, "reward": 0.7694532871246338, "reward_std": 0.3119221031665802, "rewards/DrugCombAccuracyCOTORM/mean": 0.733301043510437, "rewards/DrugCombAccuracyCOTORM/std": 0.3362129330635071, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.4929773211479187, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 463.9375, "completions/min_length": 398.0, "epoch": 7.319117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0604017972946167, "kl": 0.012669503455981612, "learning_rate": 7.995734518773389e-07, "loss": 0.00012669044372159988, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 433.0, "completions/min_length": 306.0, "epoch": 7.320588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9071323871612549, "kl": 0.012207717634737492, "learning_rate": 7.994706935651226e-07, "loss": 0.00012140348553657532, "reward": 0.8296874761581421, "reward_std": 0.23508523404598236, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 507.875, "completions/min_length": 440.0, "epoch": 7.322058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.1133983135223389, "kl": 0.009945858037099242, "learning_rate": 7.99367915524535e-07, "loss": 0.00010006129741668701, "reward": 0.37070000171661377, "reward_std": 0.2642127275466919, "rewards/DrugCombAccuracyCOTORM/mean": 0.26649999618530273, "rewards/DrugCombAccuracyCOTORM/std": 0.43958255648612976, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.574999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.6547264456748962, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 479.375, "completions/min_length": 396.0, "epoch": 7.323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0904920101165771, "kl": 0.01780112087726593, "learning_rate": 7.992651177623465e-07, "loss": 0.00018025068857241422, "reward": 0.8158749938011169, "reward_std": 0.16298134624958038, "rewards/DrugCombAccuracyCOTORM/mean": 0.784166693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.34634920954704285, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17969882488250732, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/mean_length": 512.875, "completions/min_length": 400.0, "epoch": 7.325, "frac_reward_zero_std": 0.5, "grad_norm": 0.801175594329834, "kl": 0.009229708230122924, "learning_rate": 7.991623002853294e-07, "loss": 9.21886385185644e-05, "reward": 0.3362500071525574, "reward_std": 0.15104754269123077, "rewards/DrugCombAccuracyCOTORM/mean": 0.19375000894069672, "rewards/DrugCombAccuracyCOTORM/std": 0.3307945132255554, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 455.0, "completions/min_length": 407.0, "epoch": 7.326470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2075427770614624, "kl": 0.0174246629467234, "learning_rate": 7.990594631002572e-07, "loss": 0.00017193704843521118, "reward": 0.7336249947547913, "reward_std": 0.17558589577674866, "rewards/DrugCombAccuracyCOTORM/mean": 0.6943749785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.4112161695957184, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.4989572763442993, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 422.0625, "completions/min_length": 385.0, "epoch": 7.327941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.210089921951294, "kl": 0.0123280807165429, "learning_rate": 7.989566062139043e-07, "loss": 0.00012365728616714478, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 480.875, "completions/min_length": 403.0, "epoch": 7.329411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.3062975406646729, "kl": 0.010541296331211925, "learning_rate": 7.988537296330468e-07, "loss": 0.00010537623893469572, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 471.25, "completions/min_length": 405.0, "epoch": 7.330882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016894806176424026, "kl": 0.009826651308685541, "learning_rate": 7.987508333644619e-07, "loss": 9.81910852715373e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 457.9375, "completions/min_length": 382.0, "epoch": 7.33235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01922813057899475, "kl": 0.008705006795935333, "learning_rate": 7.986479174149281e-07, "loss": 8.747465471969917e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 496.6875, "completions/min_length": 429.0, "epoch": 7.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9027718305587769, "kl": 0.01324700377881527, "learning_rate": 7.985449817912252e-07, "loss": 0.00013130903244018555, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 530.5625, "completions/min_length": 455.0, "epoch": 7.3352941176470585, "frac_reward_zero_std": 0.0, "grad_norm": 1.3773105144500732, "kl": 0.012249273946508765, "learning_rate": 7.984420265001346e-07, "loss": 0.00012193620204925537, "reward": 0.5562499761581421, "reward_std": 0.4307469427585602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7861651182174683, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 415.25, "completions/min_length": 365.0, "epoch": 7.336764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018576426431536674, "kl": 0.008096026838757098, "learning_rate": 7.983390515484384e-07, "loss": 8.031089964788407e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 430.5, "completions/min_length": 381.0, "epoch": 7.338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1534337997436523, "kl": 0.009342832840047777, "learning_rate": 7.982360569429205e-07, "loss": 9.3117356300354e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 457.0, "completions/min_length": 400.0, "epoch": 7.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9429790377616882, "kl": 0.008658372680656612, "learning_rate": 7.981330426903661e-07, "loss": 8.66465998115018e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 450.3125, "completions/min_length": 415.0, "epoch": 7.341176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7609527111053467, "kl": 0.00669635453959927, "learning_rate": 7.980300087975611e-07, "loss": 6.683170795440674e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 496.875, "completions/min_length": 449.0, "epoch": 7.3426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.029699906706809998, "kl": 0.012494683032855392, "learning_rate": 7.979269552712935e-07, "loss": 0.00012476539995986968, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 468.625, "completions/min_length": 421.0, "epoch": 7.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1026525497436523, "kl": 0.009085614117793739, "learning_rate": 7.978238821183519e-07, "loss": 9.151366248261184e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 466.0, "completions/min_length": 367.0, "epoch": 7.345588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9461535215377808, "kl": 0.00836700713261962, "learning_rate": 7.977207893455267e-07, "loss": 8.34539532661438e-05, "reward": 0.893750011920929, "reward_std": 0.18212535977363586, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 446.8125, "completions/min_length": 397.0, "epoch": 7.347058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.020233359187841415, "kl": 0.008825112716294825, "learning_rate": 7.976176769596094e-07, "loss": 8.813809836283326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 408.1875, "completions/min_length": 351.0, "epoch": 7.348529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.020212357863783836, "kl": 0.008609091164544225, "learning_rate": 7.975145449673926e-07, "loss": 8.63988752826117e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 484.125, "completions/min_length": 440.0, "epoch": 7.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.5369690656661987, "kl": 0.01852134382352233, "learning_rate": 7.974113933756707e-07, "loss": 0.00018446892499923706, "reward": 0.625, "reward_std": 0.405225932598114, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 454.6875, "completions/min_length": 393.0, "epoch": 7.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01140467170625925, "kl": 0.00944750674534589, "learning_rate": 7.973082221912386e-07, "loss": 9.397246321896091e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 484.8125, "completions/min_length": 383.0, "epoch": 7.352941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016980662941932678, "kl": 0.007940770359709859, "learning_rate": 7.972050314208933e-07, "loss": 7.978337089298293e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 470.5625, "completions/min_length": 421.0, "epoch": 7.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009519217535853386, "kl": 0.006196693517267704, "learning_rate": 7.971018210714328e-07, "loss": 6.208203558344394e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 497.4375, "completions/min_length": 437.0, "epoch": 7.355882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0366862416267395, "kl": 0.01196897542104125, "learning_rate": 7.96998591149656e-07, "loss": 0.0001203539504786022, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 471.125, "completions/min_length": 428.0, "epoch": 7.357352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.025938620790839195, "kl": 0.01010404247790575, "learning_rate": 7.968953416623639e-07, "loss": 0.00010021769412560388, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 475.9375, "completions/min_length": 404.0, "epoch": 7.358823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0700485706329346, "kl": 0.010083723231218755, "learning_rate": 7.967920726163577e-07, "loss": 0.00010044054943136871, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 476.3125, "completions/min_length": 430.0, "epoch": 7.360294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4741708040237427, "kl": 0.014559628441929817, "learning_rate": 7.966887840184411e-07, "loss": 0.00014686957001686096, "reward": 0.39328888058662415, "reward_std": 0.26751047372817993, "rewards/DrugCombAccuracyCOTORM/mean": 0.3804999887943268, "rewards/DrugCombAccuracyCOTORM/std": 0.4960685670375824, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1111111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.9609667062759399, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 417.625, "completions/min_length": 341.0, "epoch": 7.3617647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.01921262964606285, "kl": 0.008467131527140737, "learning_rate": 7.965854758754181e-07, "loss": 8.39738204376772e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 363.4375, "completions/min_length": 328.0, "epoch": 7.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705615639686584, "kl": 0.009213970275595784, "learning_rate": 7.964821481940947e-07, "loss": 9.162724018096924e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 429.4375, "completions/min_length": 344.0, "epoch": 7.364705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011945705860853195, "kl": 0.009003696031868458, "learning_rate": 7.963788009812774e-07, "loss": 9.101523755816743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 450.5625, "completions/min_length": 395.0, "epoch": 7.366176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9267370104789734, "kl": 0.01206875266507268, "learning_rate": 7.96275434243775e-07, "loss": 0.00011990594794042408, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 415.125, "completions/min_length": 339.0, "epoch": 7.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.019700290635228157, "kl": 0.009194617043249309, "learning_rate": 7.961720479883965e-07, "loss": 9.249780123354867e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 463.8125, "completions/min_length": 415.0, "epoch": 7.3691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010759233497083187, "kl": 0.008617528364993632, "learning_rate": 7.960686422219532e-07, "loss": 8.6233951151371e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 456.875, "completions/min_length": 403.0, "epoch": 7.370588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.875274658203125, "kl": 0.011065538274124265, "learning_rate": 7.95965216951257e-07, "loss": 0.00011080580588895828, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 433.5, "completions/min_length": 374.0, "epoch": 7.372058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8234246373176575, "kl": 0.008995402720756829, "learning_rate": 7.958617721831211e-07, "loss": 9.127175871981308e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 476.8125, "completions/min_length": 376.0, "epoch": 7.373529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7034321427345276, "kl": 0.008705989457666874, "learning_rate": 7.957583079243606e-07, "loss": 8.732080459594727e-05, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 458.3125, "completions/min_length": 430.0, "epoch": 7.375, "frac_reward_zero_std": 0.5, "grad_norm": 1.5457651615142822, "kl": 0.009672761778347194, "learning_rate": 7.956548241817911e-07, "loss": 9.681284427642822e-05, "reward": 0.7865833044052124, "reward_std": 0.1115071028470993, "rewards/DrugCombAccuracyCOTORM/mean": 0.7462500333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.30946996808052063, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.3381595015525818, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 478.0625, "completions/min_length": 414.0, "epoch": 7.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9863165616989136, "kl": 0.009680918999947608, "learning_rate": 7.955513209622302e-07, "loss": 9.670853614807129e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 473.625, "completions/min_length": 329.0, "epoch": 7.377941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2783122062683105, "kl": 0.010693657211959362, "learning_rate": 7.954477982724961e-07, "loss": 0.0001081228256225586, "reward": 0.6187499761581421, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 450.25, "completions/min_length": 405.0, "epoch": 7.379411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0316721200942993, "kl": 0.01026654930319637, "learning_rate": 7.953442561194086e-07, "loss": 0.00010243058204650879, "reward": 0.6365000009536743, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.5612499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.4041472375392914, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 418.9375, "completions/min_length": 365.0, "epoch": 7.3808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.2182903289794922, "kl": 0.006563290604390204, "learning_rate": 7.952406945097892e-07, "loss": 6.514787673950195e-05, "reward": 0.5375000238418579, "reward_std": 0.34973087906837463, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 534.0625, "completions/min_length": 485.0, "epoch": 7.382352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3996678590774536, "kl": 0.01095055986661464, "learning_rate": 7.951371134504597e-07, "loss": 0.0001106560230255127, "reward": 0.43760430812835693, "reward_std": 0.25965288281440735, "rewards/DrugCombAccuracyCOTORM/mean": 0.31419286131858826, "rewards/DrugCombAccuracyCOTORM/std": 0.4196471869945526, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 459.875, "completions/min_length": 331.0, "epoch": 7.383823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.93402498960495, "kl": 0.008199989679269493, "learning_rate": 7.950335129482443e-07, "loss": 8.231401443481445e-05, "reward": 0.8562500476837158, "reward_std": 0.05988579988479614, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 408.4375, "completions/min_length": 371.0, "epoch": 7.385294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.13157157599925995, "kl": 0.011799982283264399, "learning_rate": 7.949298930099676e-07, "loss": 0.00012116947618778795, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 489.9375, "completions/min_length": 445.0, "epoch": 7.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0473116636276245, "kl": 0.010475923074409366, "learning_rate": 7.948262536424559e-07, "loss": 0.00010386589565314353, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 472.8125, "completions/min_length": 350.0, "epoch": 7.3882352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.013333184644579887, "kl": 0.008370092022232711, "learning_rate": 7.947225948525369e-07, "loss": 8.36740291561e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 443.5, "completions/min_length": 398.0, "epoch": 7.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.020731205120682716, "kl": 0.010743599385023117, "learning_rate": 7.946189166470389e-07, "loss": 0.0001064210882759653, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 446.3125, "completions/min_length": 377.0, "epoch": 7.391176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01063823513686657, "kl": 0.006814510212279856, "learning_rate": 7.945152190327925e-07, "loss": 6.803217547712848e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 448.5, "completions/min_length": 406.0, "epoch": 7.392647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 18.416669845581055, "kl": 0.10362306609749794, "learning_rate": 7.944115020166287e-07, "loss": 0.0010288432240486145, "reward": 0.59375, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 435.5, "completions/min_length": 368.0, "epoch": 7.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.021205846220254898, "kl": 0.009716696571558714, "learning_rate": 7.943077656053802e-07, "loss": 9.913391841109842e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 461.3125, "completions/min_length": 423.0, "epoch": 7.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9900153875350952, "kl": 0.010872096754610538, "learning_rate": 7.942040098058811e-07, "loss": 0.00010901952191488817, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 464.625, "completions/min_length": 394.0, "epoch": 7.397058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.007164269685745239, "kl": 0.006895725149661303, "learning_rate": 7.941002346249662e-07, "loss": 6.985885556787252e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 437.75, "completions/min_length": 372.0, "epoch": 7.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.064045786857605, "kl": 0.011325267027132213, "learning_rate": 7.939964400694721e-07, "loss": 0.00011368095874786377, "reward": 0.6678333282470703, "reward_std": 0.21822859346866608, "rewards/DrugCombAccuracyCOTORM/mean": 0.6525000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.46795299649238586, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.8766518831253052, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 507.9375, "completions/min_length": 471.0, "epoch": 7.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.3122044801712036, "kl": 0.012329187244176865, "learning_rate": 7.938926261462365e-07, "loss": 0.00012348219752311707, "reward": 0.6756666898727417, "reward_std": 0.35298052430152893, "rewards/DrugCombAccuracyCOTORM/mean": 0.6154166460037231, "rewards/DrugCombAccuracyCOTORM/std": 0.4273724853992462, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.27216553688049316, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 420.125, "completions/min_length": 340.0, "epoch": 7.401470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015760092064738274, "kl": 0.008301464491523802, "learning_rate": 7.937887928620985e-07, "loss": 8.249537495430559e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 544.3125, "completions/min_length": 498.0, "epoch": 7.402941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9076455235481262, "kl": 0.01556899631395936, "learning_rate": 7.936849402238984e-07, "loss": 0.00015529058873653412, "reward": 0.687333345413208, "reward_std": 0.16396206617355347, "rewards/DrugCombAccuracyCOTORM/mean": 0.60916668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.49126818776130676, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 432.375, "completions/min_length": 356.0, "epoch": 7.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011014306917786598, "kl": 0.006830049096606672, "learning_rate": 7.935810682384776e-07, "loss": 6.862020381959155e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 529.0, "completions/min_length": 454.0, "epoch": 7.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1940629482269287, "kl": 0.009632888250052929, "learning_rate": 7.934771769126789e-07, "loss": 9.65408980846405e-05, "reward": 0.33375000953674316, "reward_std": 0.17907202243804932, "rewards/DrugCombAccuracyCOTORM/mean": 0.24642857909202576, "rewards/DrugCombAccuracyCOTORM/std": 0.3835068941116333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3660714328289032, "rewards/DrugCombCoverageCOTORM/std": 0.4073099195957184, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 429.375, "completions/min_length": 388.0, "epoch": 7.4073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.016826696693897247, "kl": 0.00949545914772898, "learning_rate": 7.933732662533465e-07, "loss": 9.541516192257404e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 428.1875, "completions/min_length": 340.0, "epoch": 7.408823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013012859039008617, "kl": 0.008171045337803662, "learning_rate": 7.93269336267326e-07, "loss": 8.136425458360463e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 490.6875, "completions/min_length": 422.0, "epoch": 7.410294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9887788891792297, "kl": 0.011476657819002867, "learning_rate": 7.931653869614635e-07, "loss": 0.00011598318815231323, "reward": 0.6285714507102966, "reward_std": 0.1873553991317749, "rewards/DrugCombAccuracyCOTORM/mean": 0.5982142686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.489950031042099, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 473.75, "completions/min_length": 419.0, "epoch": 7.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0795376300811768, "kl": 0.009556515840813518, "learning_rate": 7.930614183426073e-07, "loss": 9.58126038312912e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 444.6875, "completions/min_length": 410.0, "epoch": 7.413235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.907036542892456, "kl": 0.009419159032404423, "learning_rate": 7.929574304176065e-07, "loss": 9.396735549671575e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 501.0, "completions/min_length": 392.0, "epoch": 7.4147058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.05783586576581001, "kl": 0.011730459984391928, "learning_rate": 7.928534231933118e-07, "loss": 0.00011669559171423316, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 449.375, "completions/min_length": 363.0, "epoch": 7.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.996756374835968, "kl": 0.012722976272925735, "learning_rate": 7.927493966765743e-07, "loss": 0.0001278121053474024, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 433.6875, "completions/min_length": 360.0, "epoch": 7.41764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010952089913189411, "kl": 0.008425934705883265, "learning_rate": 7.926453508742477e-07, "loss": 8.400999649893492e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 488.75, "completions/min_length": 410.0, "epoch": 7.419117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4544841051101685, "kl": 0.010835506720468402, "learning_rate": 7.925412857931859e-07, "loss": 0.00010821223258972168, "reward": 0.7901666760444641, "reward_std": 0.3490983247756958, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4013864994049072, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 418.3125, "completions/min_length": 352.0, "epoch": 7.420588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.063227653503418, "kl": 0.011960315983742476, "learning_rate": 7.924372014402444e-07, "loss": 0.00011839604849228635, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 432.9375, "completions/min_length": 373.0, "epoch": 7.422058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010007142089307308, "kl": 0.008322521927766502, "learning_rate": 7.923330978222802e-07, "loss": 8.218858420150355e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 465.375, "completions/min_length": 413.0, "epoch": 7.423529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.017276016995310783, "kl": 0.011711092665791512, "learning_rate": 7.922289749461515e-07, "loss": 0.00011705751967383549, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 452.875, "completions/min_length": 413.0, "epoch": 7.425, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403196573257446, "kl": 0.009319268516264856, "learning_rate": 7.921248328187173e-07, "loss": 9.249782306142151e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 471.1875, "completions/min_length": 410.0, "epoch": 7.426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01594632677733898, "kl": 0.00855896552093327, "learning_rate": 7.920206714468383e-07, "loss": 8.513175998814404e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 443.875, "completions/min_length": 389.0, "epoch": 7.427941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.018453221768140793, "kl": 0.00676942290738225, "learning_rate": 7.919164908373765e-07, "loss": 6.858989945612848e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 486.8125, "completions/min_length": 413.0, "epoch": 7.429411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.016103876754641533, "kl": 0.008980001090094447, "learning_rate": 7.918122909971951e-07, "loss": 9.009390487335622e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 499.125, "completions/min_length": 426.0, "epoch": 7.430882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9131090044975281, "kl": 0.009495443664491177, "learning_rate": 7.917080719331585e-07, "loss": 9.380094707012177e-05, "reward": 0.8345833420753479, "reward_std": 0.10616230964660645, "rewards/DrugCombAccuracyCOTORM/mean": 0.8166666626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.24944382905960083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 497.5625, "completions/min_length": 432.0, "epoch": 7.432352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0104769468307495, "kl": 0.011476276908069849, "learning_rate": 7.916038336521325e-07, "loss": 0.00011491775512695312, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 486.375, "completions/min_length": 453.0, "epoch": 7.4338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.018305381760001183, "kl": 0.009260857477784157, "learning_rate": 7.914995761609836e-07, "loss": 9.283460531150922e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 410.8125, "completions/min_length": 366.0, "epoch": 7.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0333564355969429, "kl": 0.01095177698880434, "learning_rate": 7.913952994665804e-07, "loss": 0.00011024849663954228, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 437.0, "completions/min_length": 391.0, "epoch": 7.436764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9878144264221191, "kl": 0.009179860237054527, "learning_rate": 7.912910035757925e-07, "loss": 9.185075759887695e-05, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 462.1875, "completions/min_length": 398.0, "epoch": 7.438235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1490167379379272, "kl": 0.01051489997189492, "learning_rate": 7.911866884954902e-07, "loss": 0.00010441988706588745, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 458.5, "completions/min_length": 418.0, "epoch": 7.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009792500175535679, "kl": 0.007790099363774061, "learning_rate": 7.910823542325458e-07, "loss": 7.782242028042674e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.5, "completions/min_length": 395.0, "epoch": 7.4411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011298799887299538, "kl": 0.007932574837468565, "learning_rate": 7.909780007938326e-07, "loss": 7.971390732564032e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 468.3125, "completions/min_length": 390.0, "epoch": 7.442647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0854432582855225, "kl": 0.01884400541894138, "learning_rate": 7.90873628186225e-07, "loss": 0.00019501303904689848, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 472.3125, "completions/min_length": 406.0, "epoch": 7.444117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014205103740096092, "kl": 0.009693826548755169, "learning_rate": 7.90769236416599e-07, "loss": 9.703764226287603e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 452.75, "completions/min_length": 372.0, "epoch": 7.445588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04590541124343872, "kl": 0.011694576358422637, "learning_rate": 7.906648254918316e-07, "loss": 0.00011837493366329, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 452.625, "completions/min_length": 394.0, "epoch": 7.447058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01323176734149456, "kl": 0.010933178011327982, "learning_rate": 7.905603954188009e-07, "loss": 0.00010925861715804785, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 542.5625, "completions/min_length": 448.0, "epoch": 7.448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8282454609870911, "kl": 0.008880258770659566, "learning_rate": 7.904559462043867e-07, "loss": 8.866809366736561e-05, "reward": 0.9295833110809326, "reward_std": 0.1059415340423584, "rewards/DrugCombAccuracyCOTORM/mean": 0.925000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.17320507764816284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.23471809923648834, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 474.3125, "completions/min_length": 400.0, "epoch": 7.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.023057762533426285, "kl": 0.00970664364285767, "learning_rate": 7.903514778554699e-07, "loss": 9.66174848144874e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 454.0, "completions/min_length": 374.0, "epoch": 7.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8559519052505493, "kl": 0.011537105543538928, "learning_rate": 7.902469903789325e-07, "loss": 0.00011669484956655651, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 460.375, "completions/min_length": 431.0, "epoch": 7.452941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8681007027626038, "kl": 0.010933486046269536, "learning_rate": 7.901424837816578e-07, "loss": 0.00010943040251731873, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 443.75, "completions/min_length": 379.0, "epoch": 7.454411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01869523525238037, "kl": 0.010165169253014028, "learning_rate": 7.900379580705307e-07, "loss": 0.00010191773617407307, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 413.375, "completions/min_length": 372.0, "epoch": 7.455882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.024583712220191956, "kl": 0.009124101372435689, "learning_rate": 7.899334132524368e-07, "loss": 9.153192513622344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/mean_length": 559.1875, "completions/min_length": 478.0, "epoch": 7.45735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3361667394638062, "kl": 0.01244771876372397, "learning_rate": 7.898288493342633e-07, "loss": 0.00012620165944099426, "reward": 0.6381666660308838, "reward_std": 0.07987749576568604, "rewards/DrugCombAccuracyCOTORM/mean": 0.5685417056083679, "rewards/DrugCombAccuracyCOTORM/std": 0.4143356680870056, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 470.1875, "completions/min_length": 375.0, "epoch": 7.458823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.5057708024978638, "kl": 0.011997871100902557, "learning_rate": 7.897242663228989e-07, "loss": 0.00011916086077690125, "reward": 0.6445833444595337, "reward_std": 0.37712085247039795, "rewards/DrugCombAccuracyCOTORM/mean": 0.5687500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.45213383436203003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 422.75, "completions/min_length": 357.0, "epoch": 7.4602941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 1.0379176139831543, "kl": 0.01199690205976367, "learning_rate": 7.896196642252328e-07, "loss": 0.00011997558613074943, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 454.4375, "completions/min_length": 417.0, "epoch": 7.461764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015938453376293182, "kl": 0.009450608980841935, "learning_rate": 7.895150430481563e-07, "loss": 9.507108188699931e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 503.25, "completions/min_length": 413.0, "epoch": 7.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012847496196627617, "kl": 0.008941216510720551, "learning_rate": 7.894104027985614e-07, "loss": 8.938898827182129e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 462.5, "completions/min_length": 375.0, "epoch": 7.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1287368535995483, "kl": 0.009193316102027893, "learning_rate": 7.893057434833415e-07, "loss": 9.210921416524798e-05, "reward": 0.9354166984558105, "reward_std": 0.09005618840456009, "rewards/DrugCombAccuracyCOTORM/mean": 0.9270833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.16065549850463867, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 473.25, "completions/min_length": 370.0, "epoch": 7.466176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010512116365134716, "kl": 0.008083413355052471, "learning_rate": 7.892010651093914e-07, "loss": 8.07906617410481e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 451.1875, "completions/min_length": 380.0, "epoch": 7.4676470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.6145179271697998, "kl": 0.014700864674523473, "learning_rate": 7.890963676836071e-07, "loss": 0.00014601647853851318, "reward": 0.6227707862854004, "reward_std": 0.2217184603214264, "rewards/DrugCombAccuracyCOTORM/mean": 0.5727343559265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4065685570240021, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5230785608291626, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 453.3125, "completions/min_length": 399.0, "epoch": 7.469117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4488275051116943, "kl": 0.009774613892659545, "learning_rate": 7.889916512128856e-07, "loss": 9.764358401298523e-05, "reward": 0.75, "reward_std": 0.35523033142089844, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 464.875, "completions/min_length": 403.0, "epoch": 7.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1580561399459839, "kl": 0.012637918116524816, "learning_rate": 7.888869157041256e-07, "loss": 0.00012560933828353882, "reward": 0.6026666760444641, "reward_std": 0.02894548512995243, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 497.375, "completions/min_length": 344.0, "epoch": 7.472058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.2176265716552734, "kl": 0.01325824111700058, "learning_rate": 7.887821611642266e-07, "loss": 0.00013183057308197021, "reward": 0.11249999701976776, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 483.0625, "completions/min_length": 398.0, "epoch": 7.473529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.451542615890503, "kl": 0.01114654098637402, "learning_rate": 7.886773876000896e-07, "loss": 0.0001140981912612915, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 513.9375, "completions/min_length": 458.0, "epoch": 7.475, "frac_reward_zero_std": 0.0, "grad_norm": 1.252852201461792, "kl": 0.01170573488343507, "learning_rate": 7.885725950186168e-07, "loss": 0.00011740624904632568, "reward": 0.374916672706604, "reward_std": 0.32451072335243225, "rewards/DrugCombAccuracyCOTORM/mean": 0.2837499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3697364330291748, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.40311291813850403, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 430.375, "completions/min_length": 332.0, "epoch": 7.476470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9297052621841431, "kl": 0.011564347427338362, "learning_rate": 7.884677834267119e-07, "loss": 0.00011561065912246704, "reward": 0.22499999403953552, "reward_std": 0.14880476891994476, "rewards/DrugCombAccuracyCOTORM/mean": 0.15625, "rewards/DrugCombAccuracyCOTORM/std": 0.3010398745536804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 473.0625, "completions/min_length": 419.0, "epoch": 7.477941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02048318088054657, "kl": 0.010787218110635877, "learning_rate": 7.883629528312793e-07, "loss": 0.00010818673035828397, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 426.0, "completions/min_length": 374.0, "epoch": 7.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.026727810502052307, "kl": 0.013147527119144797, "learning_rate": 7.882581032392252e-07, "loss": 0.0001302040764130652, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 521.625, "completions/min_length": 425.0, "epoch": 7.480882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5701967477798462, "kl": 0.012796364724636078, "learning_rate": 7.881532346574566e-07, "loss": 0.00012770295143127441, "reward": 0.4750000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 415.375, "completions/min_length": 359.0, "epoch": 7.482352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008735911920666695, "kl": 0.007869396824389696, "learning_rate": 7.880483470928823e-07, "loss": 7.795784040354192e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 505.4375, "completions/min_length": 392.0, "epoch": 7.483823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7116569876670837, "kl": 0.009891933528706431, "learning_rate": 7.879434405524118e-07, "loss": 9.998679161071777e-05, "reward": 0.850595235824585, "reward_std": 0.1318598836660385, "rewards/DrugCombAccuracyCOTORM/mean": 0.8288690447807312, "rewards/DrugCombAccuracyCOTORM/std": 0.26898354291915894, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.23959842324256897, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 484.1875, "completions/min_length": 394.0, "epoch": 7.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011693863198161125, "kl": 0.009762370260432363, "learning_rate": 7.878385150429561e-07, "loss": 9.756387589732185e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 467.1875, "completions/min_length": 400.0, "epoch": 7.4867647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 1.260708212852478, "kl": 0.01001653983257711, "learning_rate": 7.877335705714274e-07, "loss": 9.968600352294743e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 434.625, "completions/min_length": 329.0, "epoch": 7.488235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9810270667076111, "kl": 0.011320805409923196, "learning_rate": 7.876286071447393e-07, "loss": 0.00011333823204040527, "reward": 0.8552857041358948, "reward_std": 0.16301329433918, "rewards/DrugCombAccuracyCOTORM/mean": 0.826919674873352, "rewards/DrugCombAccuracyCOTORM/std": 0.3193928897380829, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 414.6875, "completions/min_length": 385.0, "epoch": 7.489705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018490076065063477, "kl": 0.010707376757636666, "learning_rate": 7.875236247698065e-07, "loss": 0.00010702699364628643, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 423.75, "completions/min_length": 371.0, "epoch": 7.491176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0762056112289429, "kl": 0.010875605861656368, "learning_rate": 7.87418623453545e-07, "loss": 0.00010844320058822632, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 467.125, "completions/min_length": 433.0, "epoch": 7.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01309945248067379, "kl": 0.00857548019848764, "learning_rate": 7.873136032028719e-07, "loss": 8.554788655601442e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 506.3125, "completions/min_length": 440.0, "epoch": 7.4941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3629167079925537, "kl": 0.010166356107220054, "learning_rate": 7.872085640247057e-07, "loss": 0.0001015588641166687, "reward": 0.5595166683197021, "reward_std": 0.1828179508447647, "rewards/DrugCombAccuracyCOTORM/mean": 0.4801250100135803, "rewards/DrugCombAccuracyCOTORM/std": 0.4783272445201874, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7541666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.31666669249534607, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 417.125, "completions/min_length": 386.0, "epoch": 7.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013798308558762074, "kl": 0.009335792157799006, "learning_rate": 7.871035059259661e-07, "loss": 9.300414967583492e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 467.75, "completions/min_length": 395.0, "epoch": 7.497058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.026111651211977005, "kl": 0.012173832394182682, "learning_rate": 7.86998428913574e-07, "loss": 0.00012188527034595609, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 514.25, "completions/min_length": 374.0, "epoch": 7.498529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7911834120750427, "kl": 0.01125174481421709, "learning_rate": 7.868933329944518e-07, "loss": 0.00011283159255981445, "reward": 0.7441666722297668, "reward_std": 0.1262820065021515, "rewards/DrugCombAccuracyCOTORM/mean": 0.690625011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.38549479842185974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 458.0, "completions/min_length": 413.0, "epoch": 7.5, "frac_reward_zero_std": 0.5, "grad_norm": 1.08965003490448, "kl": 0.011419278685934842, "learning_rate": 7.86788218175523e-07, "loss": 0.00011344254016876221, "reward": 0.5213333368301392, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.42250001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.410779744386673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 489.4375, "completions/min_length": 391.0, "epoch": 7.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9330285787582397, "kl": 0.011412424966692924, "learning_rate": 7.86683084463712e-07, "loss": 0.00011396408081054688, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 449.9375, "completions/min_length": 395.0, "epoch": 7.502941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.47469961643219, "kl": 0.009900641394779086, "learning_rate": 7.865779318659449e-07, "loss": 9.882450103759766e-05, "reward": 0.824999988079071, "reward_std": 0.37287637591362, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 462.8125, "completions/min_length": 377.0, "epoch": 7.504411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8704026341438293, "kl": 0.00954350910615176, "learning_rate": 7.86472760389149e-07, "loss": 9.672810847405344e-05, "reward": 0.9725377559661865, "reward_std": 0.05707345902919769, "rewards/DrugCombAccuracyCOTORM/mean": 0.9695784449577332, "rewards/DrugCombAccuracyCOTORM/std": 0.0893825963139534, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 433.0, "completions/min_length": 335.0, "epoch": 7.5058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3491564989089966, "kl": 0.01129363663494587, "learning_rate": 7.863675700402526e-07, "loss": 0.00011152029037475586, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 496.3125, "completions/min_length": 440.0, "epoch": 7.507352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.2960022687911987, "kl": 0.011259109014645219, "learning_rate": 7.862623608261854e-07, "loss": 0.00011174008250236511, "reward": 0.375, "reward_std": 0.20448634028434753, "rewards/DrugCombAccuracyCOTORM/mean": 0.2291666716337204, "rewards/DrugCombAccuracyCOTORM/std": 0.4166666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 442.3125, "completions/min_length": 400.0, "epoch": 7.508823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01783892512321472, "kl": 0.009983908385038376, "learning_rate": 7.861571327538782e-07, "loss": 9.99583353404887e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 429.1875, "completions/min_length": 364.0, "epoch": 7.510294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.7652566432952881, "kl": 0.009594375267624855, "learning_rate": 7.860518858302633e-07, "loss": 9.568780660629272e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 447.25, "completions/min_length": 397.0, "epoch": 7.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01635688729584217, "kl": 0.011552768759429455, "learning_rate": 7.85946620062274e-07, "loss": 0.00011497373634483665, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 515.5, "completions/min_length": 470.0, "epoch": 7.5132352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.010036463849246502, "kl": 0.006829375517554581, "learning_rate": 7.858413354568452e-07, "loss": 6.84528422425501e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 493.4375, "completions/min_length": 422.0, "epoch": 7.514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01544069405645132, "kl": 0.010027644224464893, "learning_rate": 7.857360320209124e-07, "loss": 9.972327097784728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 442.6875, "completions/min_length": 378.0, "epoch": 7.516176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.09412495791912079, "kl": 0.012319735833443701, "learning_rate": 7.85630709761413e-07, "loss": 0.00012424886517692357, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 413.875, "completions/min_length": 359.0, "epoch": 7.517647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.3996598720550537, "kl": 0.014887862838804722, "learning_rate": 7.855253686852854e-07, "loss": 0.0001494153548264876, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 438.0625, "completions/min_length": 356.0, "epoch": 7.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012306810356676579, "kl": 0.009182941284961998, "learning_rate": 7.85420008799469e-07, "loss": 9.121844777837396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 472.1875, "completions/min_length": 414.0, "epoch": 7.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0216519832611084, "kl": 0.009020316763781011, "learning_rate": 7.853146301109046e-07, "loss": 8.984468877315521e-05, "reward": 0.5912500023841858, "reward_std": 0.08309976756572723, "rewards/DrugCombAccuracyCOTORM/mean": 0.551562488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.4697107970714569, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.7691987156867981, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 460.9375, "completions/min_length": 420.0, "epoch": 7.522058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02315954491496086, "kl": 0.010135075310245156, "learning_rate": 7.852092326265345e-07, "loss": 0.00010031723650172353, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 475.5, "completions/min_length": 381.0, "epoch": 7.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010231692343950272, "kl": 0.008274823427200317, "learning_rate": 7.851038163533019e-07, "loss": 8.26375326141715e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 424.0, "completions/min_length": 333.0, "epoch": 7.525, "frac_reward_zero_std": 0.5, "grad_norm": 1.0122573375701904, "kl": 0.010039438493549824, "learning_rate": 7.849983812981515e-07, "loss": 0.00010131445742445067, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 512.4375, "completions/min_length": 438.0, "epoch": 7.526470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3264658451080322, "kl": 0.015821882057935, "learning_rate": 7.84892927468029e-07, "loss": 0.00015811622142791748, "reward": 0.5255208611488342, "reward_std": 0.4431156814098358, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7552083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.505038321018219, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 466.875, "completions/min_length": 388.0, "epoch": 7.527941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9033525586128235, "kl": 0.009482597233727574, "learning_rate": 7.847874548698811e-07, "loss": 9.3899667263031e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 494.375, "completions/min_length": 430.0, "epoch": 7.529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.6847625970840454, "kl": 0.009899048367515206, "learning_rate": 7.846819635106568e-07, "loss": 9.849667549133301e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 460.625, "completions/min_length": 379.0, "epoch": 7.530882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0978073924779892, "kl": 0.010718692908994853, "learning_rate": 7.845764533973049e-07, "loss": 0.00010675321391317993, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 485.5625, "completions/min_length": 418.0, "epoch": 7.5323529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7447341084480286, "kl": 0.007158580236136913, "learning_rate": 7.844709245367766e-07, "loss": 7.180124521255493e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 434.375, "completions/min_length": 372.0, "epoch": 7.533823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8316693305969238, "kl": 0.01045550568960607, "learning_rate": 7.843653769360234e-07, "loss": 0.00010486094106454402, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 407.8125, "completions/min_length": 342.0, "epoch": 7.535294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.1231732368469238, "kl": 0.008788994629867375, "learning_rate": 7.842598106019991e-07, "loss": 8.720904588699341e-05, "reward": 0.793749988079071, "reward_std": 0.35761818289756775, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 496.0625, "completions/min_length": 447.0, "epoch": 7.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.700487494468689, "kl": 0.007930160500109196, "learning_rate": 7.841542255416578e-07, "loss": 7.934380118967965e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 437.375, "completions/min_length": 399.0, "epoch": 7.538235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012921384535729885, "kl": 0.009354667039588094, "learning_rate": 7.840486217619552e-07, "loss": 9.322649566456676e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 472.375, "completions/min_length": 429.0, "epoch": 7.5397058823529415, "frac_reward_zero_std": 1.0, "grad_norm": 0.014721748419106007, "kl": 0.008018766297027469, "learning_rate": 7.839429992698483e-07, "loss": 7.97626853454858e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 433.8125, "completions/min_length": 344.0, "epoch": 7.541176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.35329562425613403, "kl": 0.013164603849872947, "learning_rate": 7.83837358072295e-07, "loss": 0.00012881483417004347, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 466.125, "completions/min_length": 416.0, "epoch": 7.54264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0987893342971802, "kl": 0.010251654894091189, "learning_rate": 7.83731698176255e-07, "loss": 0.00010231736087007448, "reward": 0.6491071581840515, "reward_std": 0.20204630494117737, "rewards/DrugCombAccuracyCOTORM/mean": 0.6160714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.49409782886505127, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 406.375, "completions/min_length": 329.0, "epoch": 7.544117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.016952797770500183, "kl": 0.011456478736363351, "learning_rate": 7.836260195886888e-07, "loss": 0.00011267482477705926, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 489.1875, "completions/min_length": 408.0, "epoch": 7.545588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.016478879377245903, "kl": 0.00791526457760483, "learning_rate": 7.83520322316558e-07, "loss": 7.970737351570278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 441.0625, "completions/min_length": 420.0, "epoch": 7.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012493038550019264, "kl": 0.006868327269330621, "learning_rate": 7.83414606366826e-07, "loss": 6.92519242875278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 454.0625, "completions/min_length": 400.0, "epoch": 7.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010865925811231136, "kl": 0.007041894947178662, "learning_rate": 7.833088717464569e-07, "loss": 7.047719554975629e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 475.625, "completions/min_length": 404.0, "epoch": 7.55, "frac_reward_zero_std": 0.5, "grad_norm": 0.9102696180343628, "kl": 0.011377193266525865, "learning_rate": 7.832031184624164e-07, "loss": 0.00011389702558517456, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 460.3125, "completions/min_length": 416.0, "epoch": 7.551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.018326517194509506, "kl": 0.008293347549624741, "learning_rate": 7.83097346521671e-07, "loss": 8.292769780382514e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 471.25, "completions/min_length": 348.0, "epoch": 7.552941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.8031493425369263, "kl": 0.016098643885925412, "learning_rate": 7.829915559311891e-07, "loss": 0.00015302002429962158, "reward": 0.7734375, "reward_std": 0.41953861713409424, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 414.0625, "completions/min_length": 374.0, "epoch": 7.554411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8444445729255676, "kl": 0.008758786483667791, "learning_rate": 7.828857466979394e-07, "loss": 8.737953612580895e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 507.0625, "completions/min_length": 459.0, "epoch": 7.555882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2869634628295898, "kl": 0.010685695335268974, "learning_rate": 7.827799188288926e-07, "loss": 0.00010637938976287842, "reward": 0.8374999761581421, "reward_std": 0.35143834352493286, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 423.5, "completions/min_length": 369.0, "epoch": 7.557352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.014112493023276329, "kl": 0.009203035500831902, "learning_rate": 7.826740723310206e-07, "loss": 9.267382847610861e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 525.5, "completions/min_length": 435.0, "epoch": 7.5588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8326918482780457, "kl": 0.012207050807774067, "learning_rate": 7.825682072112959e-07, "loss": 0.0001207888126373291, "reward": 0.8722000122070312, "reward_std": 0.07887986302375793, "rewards/DrugCombAccuracyCOTORM/mean": 0.8496249914169312, "rewards/DrugCombAccuracyCOTORM/std": 0.2005000114440918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.09999999403953552, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 448.0, "completions/min_length": 391.0, "epoch": 7.560294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01074399147182703, "kl": 0.008354834979400039, "learning_rate": 7.824623234766928e-07, "loss": 8.323673682752997e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 461.125, "completions/min_length": 390.0, "epoch": 7.561764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013625524006783962, "kl": 0.008940293919295073, "learning_rate": 7.823564211341868e-07, "loss": 8.882758993422613e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 473.5, "completions/min_length": 409.0, "epoch": 7.563235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1874853372573853, "kl": 0.011462009511888027, "learning_rate": 7.822505001907543e-07, "loss": 0.00011461973190307617, "reward": 0.5178333520889282, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833701252937317, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 475.5625, "completions/min_length": 379.0, "epoch": 7.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8411136269569397, "kl": 0.010271470295265317, "learning_rate": 7.82144560653373e-07, "loss": 0.00010234862565994263, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 428.9375, "completions/min_length": 383.0, "epoch": 7.5661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01230248436331749, "kl": 0.009126762044616044, "learning_rate": 7.820386025290221e-07, "loss": 9.099744056584314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 430.375, "completions/min_length": 377.0, "epoch": 7.567647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02463124319911003, "kl": 0.011823299573734403, "learning_rate": 7.819326258246818e-07, "loss": 0.00011769424600061029, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 500.125, "completions/min_length": 393.0, "epoch": 7.569117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.5501400232315063, "kl": 0.014183902880176902, "learning_rate": 7.818266305473335e-07, "loss": 0.00014092028141021729, "reward": 0.5367375612258911, "reward_std": 0.291622519493103, "rewards/DrugCombAccuracyCOTORM/mean": 0.43849998712539673, "rewards/DrugCombAccuracyCOTORM/std": 0.4256190359592438, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.16865138709545135, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/mean_length": 501.75, "completions/min_length": 388.0, "epoch": 7.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8295066952705383, "kl": 0.009648811537772417, "learning_rate": 7.817206167039603e-07, "loss": 9.737977961776778e-05, "reward": 0.5686249732971191, "reward_std": 0.035596705973148346, "rewards/DrugCombAccuracyCOTORM/mean": 0.5154687762260437, "rewards/DrugCombAccuracyCOTORM/std": 0.5021754503250122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 446.8125, "completions/min_length": 391.0, "epoch": 7.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04261152818799019, "kl": 0.011540639447048306, "learning_rate": 7.816145843015456e-07, "loss": 0.0001145423884736374, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 467.1875, "completions/min_length": 396.0, "epoch": 7.573529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.1720645427703857, "kl": 0.011389417108148336, "learning_rate": 7.815085333470748e-07, "loss": 0.00011304765939712524, "reward": 0.42500001192092896, "reward_std": 0.38195645809173584, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 465.1875, "completions/min_length": 390.0, "epoch": 7.575, "frac_reward_zero_std": 1.0, "grad_norm": 0.014186042360961437, "kl": 0.009944126708433032, "learning_rate": 7.814024638475343e-07, "loss": 9.926898928824812e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 452.0, "completions/min_length": 410.0, "epoch": 7.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0115840435028076, "kl": 0.012342444621026516, "learning_rate": 7.812963758099115e-07, "loss": 0.00012280046939849854, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 447.1875, "completions/min_length": 385.0, "epoch": 7.577941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011202922090888023, "kl": 0.009625215199775994, "learning_rate": 7.811902692411954e-07, "loss": 9.64310165727511e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 474.375, "completions/min_length": 416.0, "epoch": 7.579411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8592169880867004, "kl": 0.012358098989352584, "learning_rate": 7.810841441483759e-07, "loss": 0.0001245463645318523, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 400.4375, "completions/min_length": 378.0, "epoch": 7.580882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011309577152132988, "kl": 0.007457243395037949, "learning_rate": 7.809780005384445e-07, "loss": 7.479204214178026e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 489.3125, "completions/min_length": 413.0, "epoch": 7.58235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9204642176628113, "kl": 0.010056418599560857, "learning_rate": 7.808718384183933e-07, "loss": 9.994208812713623e-05, "reward": 0.5820000171661377, "reward_std": 0.08818595111370087, "rewards/DrugCombAccuracyCOTORM/mean": 0.5504166483879089, "rewards/DrugCombAccuracyCOTORM/std": 0.47041648626327515, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8563488721847534, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 482.875, "completions/min_length": 421.0, "epoch": 7.583823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02009808085858822, "kl": 0.011911016306839883, "learning_rate": 7.807656577952163e-07, "loss": 0.00011990319762844592, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 443.1875, "completions/min_length": 360.0, "epoch": 7.5852941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.8872495889663696, "kl": 0.012394048273563385, "learning_rate": 7.806594586759084e-07, "loss": 0.00012206408428028226, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 449.8125, "completions/min_length": 419.0, "epoch": 7.586764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8407547473907471, "kl": 0.01124901114962995, "learning_rate": 7.805532410674653e-07, "loss": 0.00011240955791436136, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 447.8125, "completions/min_length": 360.0, "epoch": 7.588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2429821491241455, "kl": 0.01219906541518867, "learning_rate": 7.804470049768849e-07, "loss": 0.0001224882435053587, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 441.1875, "completions/min_length": 394.0, "epoch": 7.589705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1283890008926392, "kl": 0.008812902960926294, "learning_rate": 7.803407504111654e-07, "loss": 8.784990495769307e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.25, "completions/min_length": 391.0, "epoch": 7.591176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.033660903573036194, "kl": 0.010095163946971297, "learning_rate": 7.802344773773069e-07, "loss": 0.00010150588059332222, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 452.25, "completions/min_length": 388.0, "epoch": 7.5926470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0155799388885498, "kl": 0.010797975119203329, "learning_rate": 7.801281858823101e-07, "loss": 0.00010757893323898315, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 442.5, "completions/min_length": 393.0, "epoch": 7.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.034325726330280304, "kl": 0.010242709307931364, "learning_rate": 7.800218759331774e-07, "loss": 0.00010292480874340981, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 435.4375, "completions/min_length": 382.0, "epoch": 7.595588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0114436149597168, "kl": 0.010238176211714745, "learning_rate": 7.79915547536912e-07, "loss": 0.00010133534669876099, "reward": 0.3499999940395355, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 415.5, "completions/min_length": 360.0, "epoch": 7.597058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016872959211468697, "kl": 0.012290989980101585, "learning_rate": 7.798092007005189e-07, "loss": 0.00012335239443928003, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 468.75, "completions/min_length": 417.0, "epoch": 7.598529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0474299192428589, "kl": 0.009454595274291933, "learning_rate": 7.797028354310035e-07, "loss": 9.474946273257956e-05, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 429.4375, "completions/min_length": 380.0, "epoch": 7.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.8924087285995483, "kl": 0.009140008711256087, "learning_rate": 7.795964517353733e-07, "loss": 9.110569953918457e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 433.4375, "completions/min_length": 373.0, "epoch": 7.601470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2286462783813477, "kl": 0.010081445565447211, "learning_rate": 7.794900496206365e-07, "loss": 0.00010007871605921537, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 463.875, "completions/min_length": 425.0, "epoch": 7.602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.024210810661316, "kl": 0.00914312235545367, "learning_rate": 7.793836290938025e-07, "loss": 9.144516661763191e-05, "reward": 0.8687499761581421, "reward_std": 0.22825033962726593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 442.5, "completions/min_length": 405.0, "epoch": 7.604411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.019457198679447174, "kl": 0.010635752696543932, "learning_rate": 7.79277190161882e-07, "loss": 0.00010596220090519637, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 446.0, "completions/min_length": 350.0, "epoch": 7.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013300462625920773, "kl": 0.007947937934659421, "learning_rate": 7.791707328318871e-07, "loss": 7.92987848399207e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 504.9375, "completions/min_length": 382.0, "epoch": 7.607352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.37159767746925354, "kl": 0.021509865997359157, "learning_rate": 7.790642571108307e-07, "loss": 0.00021598604507744312, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 460.875, "completions/min_length": 395.0, "epoch": 7.608823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.018896250054240227, "kl": 0.010020573390647769, "learning_rate": 7.789577630057273e-07, "loss": 0.00010101888619828969, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 440.875, "completions/min_length": 349.0, "epoch": 7.610294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.024283546954393387, "kl": 0.012944957474246621, "learning_rate": 7.788512505235923e-07, "loss": 0.00012823806900996715, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.875, "completions/min_length": 395.0, "epoch": 7.6117647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.39078009128570557, "kl": 0.023475574096664786, "learning_rate": 7.787447196714428e-07, "loss": 0.00023171649081632495, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 433.875, "completions/min_length": 369.0, "epoch": 7.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01102786511182785, "kl": 0.007747451309114695, "learning_rate": 7.786381704562965e-07, "loss": 7.776429993100464e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 431.875, "completions/min_length": 356.0, "epoch": 7.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013727279379963875, "kl": 0.0095351398922503, "learning_rate": 7.785316028851726e-07, "loss": 9.537673759041354e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 453.0, "completions/min_length": 391.0, "epoch": 7.616176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1929848194122314, "kl": 0.011696779634803534, "learning_rate": 7.784250169650917e-07, "loss": 0.00011670309322653338, "reward": 0.5562499761581421, "reward_std": 0.01767767034471035, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 453.9375, "completions/min_length": 408.0, "epoch": 7.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01703491061925888, "kl": 0.008671427494846284, "learning_rate": 7.783184127030752e-07, "loss": 8.712995622772723e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 516.625, "completions/min_length": 429.0, "epoch": 7.6191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.063190221786499, "kl": 0.012623211834579706, "learning_rate": 7.78211790106146e-07, "loss": 0.00012565009819809347, "reward": 0.7659527659416199, "reward_std": 0.20469573140144348, "rewards/DrugCombAccuracyCOTORM/mean": 0.7235000133514404, "rewards/DrugCombAccuracyCOTORM/std": 0.43751269578933716, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8715277910232544, "rewards/DrugCombCoverageCOTORM/std": 0.25232967734336853, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 471.25, "completions/min_length": 429.0, "epoch": 7.620588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6337201595306396, "kl": 0.01125069591216743, "learning_rate": 7.781051491813282e-07, "loss": 0.00011108815670013428, "reward": 0.7875000238418579, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 450.875, "completions/min_length": 382.0, "epoch": 7.622058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2857775688171387, "kl": 0.011521230335347354, "learning_rate": 7.779984899356469e-07, "loss": 0.00011564448504941538, "reward": 0.737500011920929, "reward_std": 0.2199837565422058, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 498.125, "completions/min_length": 424.0, "epoch": 7.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0174641609191895, "kl": 0.017173580825328827, "learning_rate": 7.778918123761286e-07, "loss": 0.00016938894987106323, "reward": 0.8605312705039978, "reward_std": 0.19623175263404846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8305468559265137, "rewards/DrugCombAccuracyCOTORM/std": 0.36936402320861816, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.12680982053279877, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 514.25, "completions/min_length": 389.0, "epoch": 7.625, "frac_reward_zero_std": 0.5, "grad_norm": 0.9685335755348206, "kl": 0.010036640102043748, "learning_rate": 7.777851165098011e-07, "loss": 0.00010058627958642319, "reward": 0.9333333373069763, "reward_std": 0.07126964628696442, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 405.5625, "completions/min_length": 356.0, "epoch": 7.626470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.032251000404358, "kl": 0.008155907038599253, "learning_rate": 7.776784023436929e-07, "loss": 8.130056812660769e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 455.5625, "completions/min_length": 390.0, "epoch": 7.627941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8874801397323608, "kl": 0.011339367367327213, "learning_rate": 7.775716698848345e-07, "loss": 0.00011332194117130712, "reward": 0.6316666603088379, "reward_std": 0.16275669634342194, "rewards/DrugCombAccuracyCOTORM/mean": 0.596875011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.47628381848335266, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6952218413352966, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 446.25, "completions/min_length": 370.0, "epoch": 7.629411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01349125150591135, "kl": 0.010478576878085732, "learning_rate": 7.774649191402568e-07, "loss": 0.00010581909737084061, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 432.375, "completions/min_length": 389.0, "epoch": 7.6308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.108896255493164, "kl": 0.009281438775360584, "learning_rate": 7.773581501169925e-07, "loss": 9.23996776691638e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 487.375, "completions/min_length": 401.0, "epoch": 7.632352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3713022470474243, "kl": 0.015764540759846568, "learning_rate": 7.772513628220751e-07, "loss": 0.00015660375356674194, "reward": 0.8187500238418579, "reward_std": 0.38343703746795654, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 440.25, "completions/min_length": 341.0, "epoch": 7.633823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.3052926063537598, "kl": 0.009313456946983933, "learning_rate": 7.771445572625394e-07, "loss": 9.448826313018799e-05, "reward": 0.9879167079925537, "reward_std": 0.03417681157588959, "rewards/DrugCombAccuracyCOTORM/mean": 0.987500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.05000000074505806, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/mean_length": 548.625, "completions/min_length": 449.0, "epoch": 7.635294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.6279743909835815, "kl": 0.011565777938812971, "learning_rate": 7.770377334454219e-07, "loss": 0.0001176595687866211, "reward": 0.7124999761581421, "reward_std": 0.29083049297332764, "rewards/DrugCombAccuracyCOTORM/mean": 0.66015625, "rewards/DrugCombAccuracyCOTORM/std": 0.3186850845813751, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.4961658716201782, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 432.8125, "completions/min_length": 366.0, "epoch": 7.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.029604675248265266, "kl": 0.0068565685069188476, "learning_rate": 7.769308913777595e-07, "loss": 6.86203857185319e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 445.4375, "completions/min_length": 377.0, "epoch": 7.6382352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.3283486366271973, "kl": 0.01154896488878876, "learning_rate": 7.768240310665908e-07, "loss": 0.00011629371147137135, "reward": 0.925000011920929, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 476.75, "completions/min_length": 438.0, "epoch": 7.639705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011074593290686607, "kl": 0.009452034253627062, "learning_rate": 7.767171525189556e-07, "loss": 9.480701555730775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.875, "completions/min_length": 418.0, "epoch": 7.641176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.00802549533545971, "kl": 0.006597819156013429, "learning_rate": 7.766102557418945e-07, "loss": 6.609443516936153e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 412.0625, "completions/min_length": 369.0, "epoch": 7.642647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.010458783246576786, "kl": 0.008856420638039708, "learning_rate": 7.765033407424498e-07, "loss": 8.815020555630326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 447.4375, "completions/min_length": 376.0, "epoch": 7.644117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.06587616354227066, "kl": 0.00923940516076982, "learning_rate": 7.763964075276648e-07, "loss": 9.193014557240531e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 449.375, "completions/min_length": 402.0, "epoch": 7.645588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.03109140694141388, "kl": 0.007712892140261829, "learning_rate": 7.76289456104584e-07, "loss": 7.694890518905595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 399.4375, "completions/min_length": 363.0, "epoch": 7.647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.953952968120575, "kl": 0.009170255274511874, "learning_rate": 7.761824864802528e-07, "loss": 9.074722038349137e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 404.5625, "completions/min_length": 330.0, "epoch": 7.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.045685190707445145, "kl": 0.010078249499201775, "learning_rate": 7.760754986617186e-07, "loss": 9.801032138057053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 476.4375, "completions/min_length": 414.0, "epoch": 7.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.023275531828403473, "kl": 0.009006089647300541, "learning_rate": 7.75968492656029e-07, "loss": 9.017010597744957e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 425.8125, "completions/min_length": 380.0, "epoch": 7.651470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012243211269378662, "kl": 0.009974258719012141, "learning_rate": 7.758614684702336e-07, "loss": 9.991628758143634e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 458.4375, "completions/min_length": 401.0, "epoch": 7.652941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234161615371704, "kl": 0.011339775519445539, "learning_rate": 7.757544261113828e-07, "loss": 0.00011354684829711914, "reward": 0.40625, "reward_std": 0.26486268639564514, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.875, "rewards/DrugCombCOTFormatORM/std": 0.22360680997371674, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8850612044334412, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 454.375, "completions/min_length": 358.0, "epoch": 7.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01338045485317707, "kl": 0.008885524235665798, "learning_rate": 7.756473655865283e-07, "loss": 8.828542195260525e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 466.4375, "completions/min_length": 402.0, "epoch": 7.655882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0414727926254272, "kl": 0.010792286600917578, "learning_rate": 7.755402869027228e-07, "loss": 0.00010848937381524593, "reward": 0.9056999683380127, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8914999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.2964784502983093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 528.125, "completions/min_length": 442.0, "epoch": 7.6573529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.2294707298278809, "kl": 0.009486292721703649, "learning_rate": 7.754331900670206e-07, "loss": 9.537115693092346e-05, "reward": 0.4820833206176758, "reward_std": 0.3660421371459961, "rewards/DrugCombAccuracyCOTORM/mean": 0.3812500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.44093653559684753, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 429.25, "completions/min_length": 399.0, "epoch": 7.658823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01941836252808571, "kl": 0.009517116472125053, "learning_rate": 7.753260750864767e-07, "loss": 9.533306729281321e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 477.8125, "completions/min_length": 452.0, "epoch": 7.660294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9893541932106018, "kl": 0.014284926233813167, "learning_rate": 7.75218941968148e-07, "loss": 0.00014323728100862354, "reward": 0.7565000057220459, "reward_std": 0.0910470113158226, "rewards/DrugCombAccuracyCOTORM/mean": 0.721666693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.3305080235004425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 455.5625, "completions/min_length": 378.0, "epoch": 7.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9344244599342346, "kl": 0.010051565011963248, "learning_rate": 7.751117907190917e-07, "loss": 0.00010067969560623169, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 467.9375, "completions/min_length": 410.0, "epoch": 7.663235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.773043155670166, "kl": 0.008315120358020067, "learning_rate": 7.75004621346367e-07, "loss": 8.305872324854136e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 490.9375, "completions/min_length": 448.0, "epoch": 7.6647058823529415, "frac_reward_zero_std": 0.0, "grad_norm": 1.303823709487915, "kl": 0.011106028803624213, "learning_rate": 7.748974338570335e-07, "loss": 0.00011141598224639893, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 476.5, "completions/min_length": 406.0, "epoch": 7.666176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3340846300125122, "kl": 0.010700888000428677, "learning_rate": 7.747902282581531e-07, "loss": 0.00010660290718078613, "reward": 0.7604166269302368, "reward_std": 0.29995816946029663, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3381595313549042, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 421.125, "completions/min_length": 383.0, "epoch": 7.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01512499526143074, "kl": 0.007477210368961096, "learning_rate": 7.746830045567877e-07, "loss": 7.492417353205383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 434.375, "completions/min_length": 392.0, "epoch": 7.669117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.04067664220929146, "kl": 0.011368130566552281, "learning_rate": 7.745757627600012e-07, "loss": 0.00011470422032289207, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 469.625, "completions/min_length": 417.0, "epoch": 7.670588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012321306392550468, "kl": 0.006068775430321693, "learning_rate": 7.744685028748581e-07, "loss": 6.106386717874557e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 459.8125, "completions/min_length": 392.0, "epoch": 7.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0614855289459229, "kl": 0.010338420164771378, "learning_rate": 7.743612249084248e-07, "loss": 0.00010389462113380432, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 418.4375, "completions/min_length": 392.0, "epoch": 7.673529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01068237517029047, "kl": 0.009036563104018569, "learning_rate": 7.742539288677682e-07, "loss": 9.046123159350827e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 465.3125, "completions/min_length": 433.0, "epoch": 7.675, "frac_reward_zero_std": 0.5, "grad_norm": 1.022877812385559, "kl": 0.010239686584100127, "learning_rate": 7.741466147599569e-07, "loss": 0.00010262052819598466, "reward": 0.960812509059906, "reward_std": 0.11083899438381195, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 442.75, "completions/min_length": 382.0, "epoch": 7.676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.096605658531189, "kl": 0.016344639007002115, "learning_rate": 7.740392825920604e-07, "loss": 0.00015442073345184326, "reward": 0.8662500381469727, "reward_std": 0.1857101172208786, "rewards/DrugCombAccuracyCOTORM/mean": 0.8354166746139526, "rewards/DrugCombAccuracyCOTORM/std": 0.35619986057281494, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 435.625, "completions/min_length": 305.0, "epoch": 7.677941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9647843837738037, "kl": 0.012226980295963585, "learning_rate": 7.739319323711495e-07, "loss": 0.00011852254101540893, "reward": 0.7875000238418579, "reward_std": 0.14658820629119873, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3333333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 470.0, "completions/min_length": 397.0, "epoch": 7.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0047128200531006, "kl": 0.009920950629748404, "learning_rate": 7.73824564104296e-07, "loss": 9.987875819206238e-05, "reward": 0.7489374876022339, "reward_std": 0.17115013301372528, "rewards/DrugCombAccuracyCOTORM/mean": 0.7213281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.39168497920036316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 512.125, "completions/min_length": 437.0, "epoch": 7.680882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0468326807022095, "kl": 0.011472183396108449, "learning_rate": 7.737171777985734e-07, "loss": 0.0001136285500251688, "reward": 0.16981305181980133, "reward_std": 0.13789516687393188, "rewards/DrugCombAccuracyCOTORM/mean": 0.11330796778202057, "rewards/DrugCombAccuracyCOTORM/std": 0.24889540672302246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.2083333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.8290557265281677, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 489.1875, "completions/min_length": 451.0, "epoch": 7.682352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9777845144271851, "kl": 0.010894726496189833, "learning_rate": 7.736097734610556e-07, "loss": 0.00010913769074250013, "reward": 0.7749999761581421, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 419.6875, "completions/min_length": 393.0, "epoch": 7.6838235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.024646371603012085, "kl": 0.012783406302332878, "learning_rate": 7.735023510988183e-07, "loss": 0.0001261171419173479, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 472.4375, "completions/min_length": 378.0, "epoch": 7.685294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.910378098487854, "kl": 0.00919380015693605, "learning_rate": 7.733949107189385e-07, "loss": 9.320676326751709e-05, "reward": 0.6395833492279053, "reward_std": 0.11740836501121521, "rewards/DrugCombAccuracyCOTORM/mean": 0.5729166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4790761172771454, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 483.875, "completions/min_length": 440.0, "epoch": 7.686764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.888312041759491, "kl": 0.008593694306910038, "learning_rate": 7.732874523284938e-07, "loss": 8.56742262840271e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 443.0, "completions/min_length": 398.0, "epoch": 7.688235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3095417022705078, "kl": 0.011611149180680513, "learning_rate": 7.731799759345632e-07, "loss": 0.00011610641377046704, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 479.25, "completions/min_length": 381.0, "epoch": 7.689705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02089034765958786, "kl": 0.008880491252057254, "learning_rate": 7.730724815442272e-07, "loss": 8.933973731473088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 445.1875, "completions/min_length": 397.0, "epoch": 7.6911764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.013533428311347961, "kl": 0.008422595681622624, "learning_rate": 7.729649691645672e-07, "loss": 8.456414798274636e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 493.3125, "completions/min_length": 426.0, "epoch": 7.692647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0632635354995728, "kl": 0.011768292402848601, "learning_rate": 7.728574388026658e-07, "loss": 0.00011759251356124878, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 421.25, "completions/min_length": 365.0, "epoch": 7.694117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011838898994028568, "kl": 0.008171049179509282, "learning_rate": 7.727498904656069e-07, "loss": 8.138152043102309e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 449.125, "completions/min_length": 383.0, "epoch": 7.695588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.1805005073547363, "kl": 0.011200550361536443, "learning_rate": 7.726423241604754e-07, "loss": 0.00011239945888519287, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 433.4375, "completions/min_length": 395.0, "epoch": 7.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9639453887939453, "kl": 0.013067757477983832, "learning_rate": 7.725347398943576e-07, "loss": 0.0001312345266342163, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/mean_length": 558.625, "completions/min_length": 435.0, "epoch": 7.698529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9743057489395142, "kl": 0.011466290103271604, "learning_rate": 7.724271376743408e-07, "loss": 0.00011442405957495794, "reward": 0.583559513092041, "reward_std": 0.1293673813343048, "rewards/DrugCombAccuracyCOTORM/mean": 0.543958306312561, "rewards/DrugCombAccuracyCOTORM/std": 0.5006946325302124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4839285612106323, "rewards/DrugCombCoverageCOTORM/std": 0.8871244788169861, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 480.0, "completions/min_length": 416.0, "epoch": 7.7, "frac_reward_zero_std": 0.5, "grad_norm": 0.7211481928825378, "kl": 0.009520876104943454, "learning_rate": 7.723195175075135e-07, "loss": 9.490549564361572e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 499.375, "completions/min_length": 444.0, "epoch": 7.701470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3193141222000122, "kl": 0.01043960521928966, "learning_rate": 7.722118794009657e-07, "loss": 0.00010515004396438599, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 436.125, "completions/min_length": 391.0, "epoch": 7.702941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014616774395108223, "kl": 0.008859887020662427, "learning_rate": 7.72104223361788e-07, "loss": 8.837762288749218e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 472.625, "completions/min_length": 416.0, "epoch": 7.704411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017282843589782715, "kl": 0.010504056001082063, "learning_rate": 7.719965493970728e-07, "loss": 0.00010483957885298878, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 430.5625, "completions/min_length": 375.0, "epoch": 7.705882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.009961090981960297, "kl": 0.009101953357458115, "learning_rate": 7.718888575139133e-07, "loss": 9.037579729920253e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 425.3125, "completions/min_length": 388.0, "epoch": 7.70735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0596270561218262, "kl": 0.010796665214002132, "learning_rate": 7.717811477194038e-07, "loss": 0.00010741539153968915, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 484.875, "completions/min_length": 383.0, "epoch": 7.708823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.5734327435493469, "kl": 0.024808624759316444, "learning_rate": 7.716734200206403e-07, "loss": 0.00024490937357768416, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.1875, "completions/min_length": 392.0, "epoch": 7.7102941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 0.7607507109642029, "kl": 0.009114990942180157, "learning_rate": 7.715656744247192e-07, "loss": 9.1335634351708e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 480.3125, "completions/min_length": 424.0, "epoch": 7.711764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5214505195617676, "kl": 0.013537748483940959, "learning_rate": 7.714579109387388e-07, "loss": 0.00013527274131774902, "reward": 0.5018541812896729, "reward_std": 0.23936329782009125, "rewards/DrugCombAccuracyCOTORM/mean": 0.4391666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4520103335380554, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 464.9375, "completions/min_length": 417.0, "epoch": 7.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02279525250196457, "kl": 0.011618151562288404, "learning_rate": 7.713501295697982e-07, "loss": 0.00011536020610947162, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 471.25, "completions/min_length": 397.0, "epoch": 7.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.017496541142463684, "kl": 0.010208583436906338, "learning_rate": 7.712423303249977e-07, "loss": 0.00010160425154026598, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 447.9375, "completions/min_length": 400.0, "epoch": 7.716176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9498282074928284, "kl": 0.009088714723475277, "learning_rate": 7.711345132114391e-07, "loss": 9.234856406692415e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 473.5, "completions/min_length": 414.0, "epoch": 7.7176470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9480314254760742, "kl": 0.010101670166477561, "learning_rate": 7.710266782362247e-07, "loss": 0.00010119099169969559, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 470.6875, "completions/min_length": 417.0, "epoch": 7.719117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019454505294561386, "kl": 0.009630927816033363, "learning_rate": 7.709188254064587e-07, "loss": 9.631298598833382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 543.1875, "completions/min_length": 436.0, "epoch": 7.720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0682133436203003, "kl": 0.01084240828640759, "learning_rate": 7.70810954729246e-07, "loss": 0.00010900120832957327, "reward": 0.7893452644348145, "reward_std": 0.14338181912899017, "rewards/DrugCombAccuracyCOTORM/mean": 0.7497023940086365, "rewards/DrugCombAccuracyCOTORM/std": 0.3461994230747223, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.23471809923648834, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 425.4375, "completions/min_length": 386.0, "epoch": 7.722058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.00988252367824316, "kl": 0.009344282327219844, "learning_rate": 7.707030662116931e-07, "loss": 9.303644037572667e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 496.625, "completions/min_length": 421.0, "epoch": 7.723529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.316171646118164, "kl": 0.014472435228526592, "learning_rate": 7.705951598609071e-07, "loss": 0.000143345445394516, "reward": 0.5761125087738037, "reward_std": 0.24498513340950012, "rewards/DrugCombAccuracyCOTORM/mean": 0.5041249990463257, "rewards/DrugCombAccuracyCOTORM/std": 0.38041648268699646, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7281249761581421, "rewards/DrugCombCoverageCOTORM/std": 0.4837246835231781, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 485.3125, "completions/min_length": 402.0, "epoch": 7.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.8833672404289246, "kl": 0.00856466859113425, "learning_rate": 7.704872356839969e-07, "loss": 8.591043297201395e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 457.25, "completions/min_length": 419.0, "epoch": 7.726470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8082074522972107, "kl": 0.00901076674927026, "learning_rate": 7.703792936880722e-07, "loss": 8.985772728919983e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 478.625, "completions/min_length": 401.0, "epoch": 7.727941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01145168673247099, "kl": 0.009695991175249219, "learning_rate": 7.702713338802437e-07, "loss": 9.749840683070943e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 454.75, "completions/min_length": 407.0, "epoch": 7.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02509002760052681, "kl": 0.010512698674574494, "learning_rate": 7.701633562676238e-07, "loss": 0.00010467453830642626, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 492.5, "completions/min_length": 394.0, "epoch": 7.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8343645930290222, "kl": 0.008607449242845178, "learning_rate": 7.700553608573257e-07, "loss": 8.65346155478619e-05, "reward": 0.9102500081062317, "reward_std": 0.1661846935749054, "rewards/DrugCombAccuracyCOTORM/mean": 0.8956249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.28520679473876953, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 492.6875, "completions/min_length": 428.0, "epoch": 7.732352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.926531970500946, "kl": 0.010420798789709806, "learning_rate": 7.699473476564637e-07, "loss": 0.00010404484783066437, "reward": 0.59375, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 448.9375, "completions/min_length": 356.0, "epoch": 7.733823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014654286205768585, "kl": 0.011147516896016896, "learning_rate": 7.698393166721537e-07, "loss": 0.00011199139407835901, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 459.9375, "completions/min_length": 407.0, "epoch": 7.735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016956590116024017, "kl": 0.008358531980775297, "learning_rate": 7.697312679115124e-07, "loss": 8.34183520055376e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 452.9375, "completions/min_length": 373.0, "epoch": 7.7367647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.012514056637883186, "kl": 0.008870617835782468, "learning_rate": 7.696232013816577e-07, "loss": 8.888336014933884e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 439.125, "completions/min_length": 344.0, "epoch": 7.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8427556157112122, "kl": 0.012499647215008736, "learning_rate": 7.695151170897089e-07, "loss": 0.00012752413749694824, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 446.625, "completions/min_length": 406.0, "epoch": 7.739705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.851347029209137, "kl": 0.010058340965770185, "learning_rate": 7.694070150427863e-07, "loss": 9.972602128982544e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 427.625, "completions/min_length": 380.0, "epoch": 7.741176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.03277686983346939, "kl": 0.013345714658498764, "learning_rate": 7.692988952480112e-07, "loss": 0.00013230870536062866, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 468.8125, "completions/min_length": 362.0, "epoch": 7.742647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0153514314442873, "kl": 0.0093431631103158, "learning_rate": 7.691907577125064e-07, "loss": 9.37023141887039e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 463.5, "completions/min_length": 403.0, "epoch": 7.7441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8279851675033569, "kl": 0.010068721137940884, "learning_rate": 7.690826024433959e-07, "loss": 0.00010061284410767257, "reward": 0.8233333826065063, "reward_std": 0.14813123643398285, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3265986442565918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 474.75, "completions/min_length": 416.0, "epoch": 7.745588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9750169515609741, "kl": 0.009228222537785769, "learning_rate": 7.689744294478044e-07, "loss": 9.184330701828003e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 489.125, "completions/min_length": 450.0, "epoch": 7.747058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9333977699279785, "kl": 0.01034975005313754, "learning_rate": 7.688662387328582e-07, "loss": 0.00010362806642660871, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 455.375, "completions/min_length": 407.0, "epoch": 7.748529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.020112410187721252, "kl": 0.009857295546680689, "learning_rate": 7.687580303056844e-07, "loss": 9.867932385532185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 497.625, "completions/min_length": 462.0, "epoch": 7.75, "frac_reward_zero_std": 0.5, "grad_norm": 1.143666386604309, "kl": 0.012088839197531343, "learning_rate": 7.68649804173412e-07, "loss": 0.00012041677109664306, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 466.8125, "completions/min_length": 381.0, "epoch": 7.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2364336252212524, "kl": 0.009679695591330528, "learning_rate": 7.685415603431702e-07, "loss": 9.703636169433594e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 454.875, "completions/min_length": 398.0, "epoch": 7.752941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2308118343353271, "kl": 0.009651865577325225, "learning_rate": 7.684332988220901e-07, "loss": 9.645149111747742e-05, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 467.6875, "completions/min_length": 415.0, "epoch": 7.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9553208947181702, "kl": 0.015701531432569027, "learning_rate": 7.683250196173035e-07, "loss": 0.00015663218800909817, "reward": 0.8208333253860474, "reward_std": 0.21744367480278015, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 425.1875, "completions/min_length": 398.0, "epoch": 7.7558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.011486602947115898, "kl": 0.008432100294157863, "learning_rate": 7.682167227359436e-07, "loss": 8.433415496256202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 448.5625, "completions/min_length": 416.0, "epoch": 7.757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.00846526212990284, "kl": 0.006954032811336219, "learning_rate": 7.681084081851449e-07, "loss": 6.958975427551195e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 471.0, "completions/min_length": 361.0, "epoch": 7.758823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8191120028495789, "kl": 0.008555477834306657, "learning_rate": 7.680000759720427e-07, "loss": 8.471666660625488e-05, "reward": 0.965749979019165, "reward_std": 0.06507985293865204, "rewards/DrugCombAccuracyCOTORM/mean": 0.9597916603088379, "rewards/DrugCombAccuracyCOTORM/std": 0.11266776919364929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 420.125, "completions/min_length": 384.0, "epoch": 7.760294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9375015497207642, "kl": 0.01097530941478908, "learning_rate": 7.678917261037738e-07, "loss": 0.00010913610458374023, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 484.4375, "completions/min_length": 423.0, "epoch": 7.761764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2205839157104492, "kl": 0.009264624794013798, "learning_rate": 7.677833585874759e-07, "loss": 9.281188249588013e-05, "reward": 0.6395000219345093, "reward_std": 0.4144558012485504, "rewards/DrugCombAccuracyCOTORM/mean": 0.5728124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5018232464790344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 453.6875, "completions/min_length": 412.0, "epoch": 7.7632352941176475, "frac_reward_zero_std": 0.5, "grad_norm": 1.2445755004882812, "kl": 0.008345548762008548, "learning_rate": 7.67674973430288e-07, "loss": 8.222233736887574e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 457.0, "completions/min_length": 389.0, "epoch": 7.764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013912624679505825, "kl": 0.00760121492203325, "learning_rate": 7.675665706393501e-07, "loss": 7.542798994109035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 407.625, "completions/min_length": 329.0, "epoch": 7.766176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.018439043313264847, "kl": 0.009648344712331891, "learning_rate": 7.674581502218038e-07, "loss": 9.653970482759178e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 494.5, "completions/min_length": 427.0, "epoch": 7.767647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.3974437713623047, "kl": 0.013741003349423409, "learning_rate": 7.673497121847914e-07, "loss": 0.0001371242105960846, "reward": 0.3187499940395355, "reward_std": 0.3130674958229065, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.981070876121521, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 412.0, "completions/min_length": 357.0, "epoch": 7.769117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0129068773239851, "kl": 0.010213384637609124, "learning_rate": 7.672412565354567e-07, "loss": 0.00010200381802860647, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 433.6875, "completions/min_length": 391.0, "epoch": 7.770588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.027352390810847282, "kl": 0.008420294732786715, "learning_rate": 7.671327832809442e-07, "loss": 8.339651685673743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 507.875, "completions/min_length": 458.0, "epoch": 7.772058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011642325669527054, "kl": 0.007988482248038054, "learning_rate": 7.670242924283999e-07, "loss": 7.918353367131203e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 489.9375, "completions/min_length": 425.0, "epoch": 7.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012242329306900501, "kl": 0.00841012701857835, "learning_rate": 7.669157839849709e-07, "loss": 8.418253855779767e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 508.25, "completions/min_length": 443.0, "epoch": 7.775, "frac_reward_zero_std": 0.5, "grad_norm": 0.8337640166282654, "kl": 0.011430363170802593, "learning_rate": 7.668072579578058e-07, "loss": 0.00011536478996276855, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 417.8125, "completions/min_length": 378.0, "epoch": 7.776470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.04545530304312706, "kl": 0.009356891503557563, "learning_rate": 7.666987143540535e-07, "loss": 9.415172826265916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 463.375, "completions/min_length": 443.0, "epoch": 7.777941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9409574866294861, "kl": 0.010881920461542904, "learning_rate": 7.66590153180865e-07, "loss": 0.0001090688310796395, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 487.6875, "completions/min_length": 414.0, "epoch": 7.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9666488170623779, "kl": 0.014392856508493423, "learning_rate": 7.664815744453917e-07, "loss": 0.00014486536383628845, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 493.4375, "completions/min_length": 427.0, "epoch": 7.780882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9251697659492493, "kl": 0.010248188045807183, "learning_rate": 7.663729781547867e-07, "loss": 0.00010291325452271849, "reward": 0.4516666829586029, "reward_std": 0.051670487970113754, "rewards/DrugCombAccuracyCOTORM/mean": 0.44999998807907104, "rewards/DrugCombAccuracyCOTORM/std": 0.47046077251434326, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.08333331346511841, "rewards/DrugCombCoverageCOTORM/std": 0.9545214176177979, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 456.9375, "completions/min_length": 403.0, "epoch": 7.7823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.02248508110642433, "kl": 0.011538193095475435, "learning_rate": 7.662643643162042e-07, "loss": 0.00011444713891251013, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 427.1875, "completions/min_length": 346.0, "epoch": 7.783823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.4442332983016968, "kl": 0.012083288049325347, "learning_rate": 7.66155732936799e-07, "loss": 0.00012074410915374756, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 480.0, "completions/min_length": 365.0, "epoch": 7.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8285504579544067, "kl": 0.011772742494940758, "learning_rate": 7.660470840237277e-07, "loss": 0.00011808804993052036, "reward": 0.8166667222976685, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3381595313549042, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 451.9375, "completions/min_length": 392.0, "epoch": 7.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018538709729909897, "kl": 0.009743831469677389, "learning_rate": 7.659384175841479e-07, "loss": 9.699238580651581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 485.0, "completions/min_length": 380.0, "epoch": 7.788235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0106995105743408, "kl": 0.010283331503160298, "learning_rate": 7.65829733625218e-07, "loss": 0.00010336562991142273, "reward": 0.8458333611488342, "reward_std": 0.06561672687530518, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.21489661931991577, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 471.5, "completions/min_length": 319.0, "epoch": 7.7897058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 1.076123595237732, "kl": 0.01031832117587328, "learning_rate": 7.657210321540982e-07, "loss": 0.000103045254945755, "reward": 0.8047410845756531, "reward_std": 0.17675252258777618, "rewards/DrugCombAccuracyCOTORM/mean": 0.7617857456207275, "rewards/DrugCombAccuracyCOTORM/std": 0.392558753490448, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 474.0625, "completions/min_length": 413.0, "epoch": 7.791176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.011672232300043106, "kl": 0.009365696925669909, "learning_rate": 7.656123131779489e-07, "loss": 9.47638473007828e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 478.4375, "completions/min_length": 426.0, "epoch": 7.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9639102220535278, "kl": 0.012579856673255563, "learning_rate": 7.655035767039328e-07, "loss": 0.0001262826845049858, "reward": 0.5737708210945129, "reward_std": 0.01578499749302864, "rewards/DrugCombAccuracyCOTORM/mean": 0.51604163646698, "rewards/DrugCombAccuracyCOTORM/std": 0.501519501209259, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.609375, "rewards/DrugCombCoverageCOTORM/std": 0.4856446385383606, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 435.0, "completions/min_length": 406.0, "epoch": 7.794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9589687585830688, "kl": 0.0077965621603652835, "learning_rate": 7.653948227392129e-07, "loss": 7.782196917105466e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 444.5625, "completions/min_length": 395.0, "epoch": 7.795588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010153696872293949, "kl": 0.008466529776342213, "learning_rate": 7.652860512909539e-07, "loss": 8.395182521780953e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/mean_length": 498.5, "completions/min_length": 372.0, "epoch": 7.797058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7772789001464844, "kl": 0.009582679253071547, "learning_rate": 7.651772623663211e-07, "loss": 9.463727474212646e-05, "reward": 0.6933416724205017, "reward_std": 0.12475486844778061, "rewards/DrugCombAccuracyCOTORM/mean": 0.6388124823570251, "rewards/DrugCombAccuracyCOTORM/std": 0.4235530495643616, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.21489661931991577, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 440.1875, "completions/min_length": 373.0, "epoch": 7.798529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9377472996711731, "kl": 0.010252010310068727, "learning_rate": 7.650684559724814e-07, "loss": 0.00010156883217860013, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 483.875, "completions/min_length": 396.0, "epoch": 7.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.007999120280146599, "kl": 0.006867548916488886, "learning_rate": 7.649596321166024e-07, "loss": 6.861629663035274e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 478.875, "completions/min_length": 431.0, "epoch": 7.801470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03426387533545494, "kl": 0.007303040474653244, "learning_rate": 7.648507908058535e-07, "loss": 7.303811435122043e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 434.9375, "completions/min_length": 389.0, "epoch": 7.802941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009129188023507595, "kl": 0.008202616823837161, "learning_rate": 7.647419320474046e-07, "loss": 8.272462582681328e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 430.5625, "completions/min_length": 394.0, "epoch": 7.804411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9250363707542419, "kl": 0.011766692390665412, "learning_rate": 7.646330558484275e-07, "loss": 0.00011716783046722412, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 475.1875, "completions/min_length": 410.0, "epoch": 7.805882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1048504114151, "kl": 0.009840973420068622, "learning_rate": 7.645241622160943e-07, "loss": 9.759798558661714e-05, "reward": 0.567187488079071, "reward_std": 0.17537428438663483, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.981070876121521, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 477.0, "completions/min_length": 417.0, "epoch": 7.807352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7278992533683777, "kl": 0.010867430130019784, "learning_rate": 7.644152511575786e-07, "loss": 0.00010841339826583862, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 451.0, "completions/min_length": 357.0, "epoch": 7.8088235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.0605882406234741, "kl": 0.009700741851702332, "learning_rate": 7.643063226800554e-07, "loss": 9.65389481279999e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 418.8125, "completions/min_length": 372.0, "epoch": 7.810294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7635983228683472, "kl": 0.011126177036203444, "learning_rate": 7.641973767907004e-07, "loss": 0.00011346489191055298, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 455.25, "completions/min_length": 400.0, "epoch": 7.811764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013286929577589035, "kl": 0.008914540405385196, "learning_rate": 7.640884134966909e-07, "loss": 8.833614992909133e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 483.4375, "completions/min_length": 435.0, "epoch": 7.813235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016185134649276733, "kl": 0.00910059199668467, "learning_rate": 7.639794328052051e-07, "loss": 9.123276686295867e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 458.5625, "completions/min_length": 393.0, "epoch": 7.814705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013254253193736076, "kl": 0.00973656156565994, "learning_rate": 7.638704347234224e-07, "loss": 9.699034126242623e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 482.0625, "completions/min_length": 363.0, "epoch": 7.8161764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 1.224936604499817, "kl": 0.011031179805286229, "learning_rate": 7.637614192585232e-07, "loss": 0.00011052936315536499, "reward": 0.6227083206176758, "reward_std": 0.2187248170375824, "rewards/DrugCombAccuracyCOTORM/mean": 0.5479166507720947, "rewards/DrugCombAccuracyCOTORM/std": 0.43840938806533813, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.18726837635040283, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 435.3125, "completions/min_length": 397.0, "epoch": 7.817647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9305585026741028, "kl": 0.012431521550752223, "learning_rate": 7.63652386417689e-07, "loss": 0.0001233033835887909, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 403.875, "completions/min_length": 364.0, "epoch": 7.819117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.023988505825400352, "kl": 0.011039134114980698, "learning_rate": 7.635433362081028e-07, "loss": 0.00011072486086050048, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 452.3125, "completions/min_length": 331.0, "epoch": 7.820588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8486126065254211, "kl": 0.011737346416339278, "learning_rate": 7.634342686369487e-07, "loss": 0.0001166369765996933, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 462.3125, "completions/min_length": 408.0, "epoch": 7.822058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.498164176940918, "kl": 0.011109278770163655, "learning_rate": 7.633251837114117e-07, "loss": 0.00011112913489341736, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 473.0625, "completions/min_length": 410.0, "epoch": 7.823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8231809139251709, "kl": 0.008457451243884861, "learning_rate": 7.632160814386779e-07, "loss": 8.416115451836959e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 454.875, "completions/min_length": 406.0, "epoch": 7.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.011421343311667442, "kl": 0.008100697305053473, "learning_rate": 7.631069618259347e-07, "loss": 8.114956290228292e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 470.0, "completions/min_length": 414.0, "epoch": 7.826470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9175124764442444, "kl": 0.009265991626307368, "learning_rate": 7.629978248803708e-07, "loss": 9.272331953980029e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 405.0625, "completions/min_length": 359.0, "epoch": 7.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013915237970650196, "kl": 0.009903307305648923, "learning_rate": 7.628886706091757e-07, "loss": 9.773668716661632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 465.3125, "completions/min_length": 427.0, "epoch": 7.829411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.4027512073516846, "kl": 0.011972251813858747, "learning_rate": 7.627794990195403e-07, "loss": 0.0001189892936963588, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 397.25, "completions/min_length": 344.0, "epoch": 7.830882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.015587442554533482, "kl": 0.009796799859032035, "learning_rate": 7.626703101186567e-07, "loss": 9.818615217227489e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 413.0, "completions/min_length": 330.0, "epoch": 7.83235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012280999682843685, "kl": 0.007235804223455489, "learning_rate": 7.625611039137176e-07, "loss": 7.25819991203025e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 408.25, "completions/min_length": 347.0, "epoch": 7.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.014966974034905434, "kl": 0.010829577455297112, "learning_rate": 7.624518804119174e-07, "loss": 0.00010843632480828092, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 490.5, "completions/min_length": 419.0, "epoch": 7.8352941176470585, "frac_reward_zero_std": 1.0, "grad_norm": 0.016667408868670464, "kl": 0.009305657353252172, "learning_rate": 7.623426396204515e-07, "loss": 9.256639168597758e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 509.5, "completions/min_length": 408.0, "epoch": 7.836764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9748742580413818, "kl": 0.011445908341556787, "learning_rate": 7.622333815465165e-07, "loss": 0.00011440963862696663, "reward": 0.906833291053772, "reward_std": 0.17410697042942047, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 495.3125, "completions/min_length": 438.0, "epoch": 7.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749482274055481, "kl": 0.009543525287881494, "learning_rate": 7.621241061973099e-07, "loss": 9.519560990156606e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 506.8125, "completions/min_length": 426.0, "epoch": 7.839705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9647530317306519, "kl": 0.011981264688074589, "learning_rate": 7.620148135800306e-07, "loss": 0.00011940500553464517, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 473.375, "completions/min_length": 433.0, "epoch": 7.841176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.5994468927383423, "kl": 0.011609692126512527, "learning_rate": 7.619055037018784e-07, "loss": 0.0001157522201538086, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 495.1875, "completions/min_length": 445.0, "epoch": 7.8426470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.7873079180717468, "kl": 0.010833356878720224, "learning_rate": 7.617961765700545e-07, "loss": 0.00010816752910614014, "reward": 0.6937500238418579, "reward_std": 0.2541618049144745, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 441.9375, "completions/min_length": 386.0, "epoch": 7.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01146123930811882, "kl": 0.008979208185337484, "learning_rate": 7.616868321917609e-07, "loss": 8.951575728133321e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 396.0, "completions/min_length": 318.0, "epoch": 7.845588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009254878386855125, "kl": 0.006907802657224238, "learning_rate": 7.61577470574201e-07, "loss": 6.94652262609452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 492.3125, "completions/min_length": 379.0, "epoch": 7.847058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9131712317466736, "kl": 0.00965929834637791, "learning_rate": 7.614680917245795e-07, "loss": 9.644031524658203e-05, "reward": 0.8500000238418579, "reward_std": 0.141421377658844, "rewards/DrugCombAccuracyCOTORM/mean": 0.828125, "rewards/DrugCombAccuracyCOTORM/std": 0.25361964106559753, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 450.625, "completions/min_length": 402.0, "epoch": 7.848529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.784375011920929, "kl": 0.008256244589574635, "learning_rate": 7.613586956501017e-07, "loss": 8.08963377494365e-05, "reward": 0.6213333010673523, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 501.9375, "completions/min_length": 369.0, "epoch": 7.85, "frac_reward_zero_std": 0.5, "grad_norm": 0.6172019243240356, "kl": 0.007007046602666378, "learning_rate": 7.612492823579744e-07, "loss": 7.078051567077637e-05, "reward": 0.7649500370025635, "reward_std": 0.21552200615406036, "rewards/DrugCombAccuracyCOTORM/mean": 0.7249374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.44349005818367004, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.4979960024356842, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 438.4375, "completions/min_length": 369.0, "epoch": 7.851470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012478742748498917, "kl": 0.010425918037071824, "learning_rate": 7.611398518554055e-07, "loss": 0.00010472546273376793, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 494.9375, "completions/min_length": 368.0, "epoch": 7.852941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3269784450531006, "kl": 0.009447874035686255, "learning_rate": 7.610304041496041e-07, "loss": 9.42423939704895e-05, "reward": 0.5, "reward_std": 0.3187023997306824, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.4654746949672699, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 516.9375, "completions/min_length": 441.0, "epoch": 7.854411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.1612738370895386, "kl": 0.011629271553829312, "learning_rate": 7.609209392477803e-07, "loss": 0.00011671334505081177, "reward": 0.5215466022491455, "reward_std": 0.17752721905708313, "rewards/DrugCombAccuracyCOTORM/mean": 0.419901967048645, "rewards/DrugCombAccuracyCOTORM/std": 0.49632391333580017, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.856249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.5006245970726013, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 420.0625, "completions/min_length": 366.0, "epoch": 7.855882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1696362495422363, "kl": 0.012133081792853773, "learning_rate": 7.608114571571453e-07, "loss": 0.00012020363647025079, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 438.375, "completions/min_length": 347.0, "epoch": 7.857352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.014838515780866146, "kl": 0.009824619628489017, "learning_rate": 7.607019578849115e-07, "loss": 9.76016599452123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 407.875, "completions/min_length": 337.0, "epoch": 7.858823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388317346572876, "kl": 0.011263814521953464, "learning_rate": 7.605924414382925e-07, "loss": 0.00011320530029479414, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 481.625, "completions/min_length": 399.0, "epoch": 7.860294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9337412714958191, "kl": 0.009703985182568431, "learning_rate": 7.604829078245029e-07, "loss": 9.64142382144928e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 458.125, "completions/min_length": 379.0, "epoch": 7.8617647058823525, "frac_reward_zero_std": 1.0, "grad_norm": 0.010067419148981571, "kl": 0.009096520952880383, "learning_rate": 7.603733570507585e-07, "loss": 9.13370749913156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 453.25, "completions/min_length": 385.0, "epoch": 7.863235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017163781449198723, "kl": 0.009991647209972143, "learning_rate": 7.602637891242763e-07, "loss": 0.00010029758414020762, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 472.8125, "completions/min_length": 434.0, "epoch": 7.864705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.0075713396072388, "kl": 0.006900366162881255, "learning_rate": 7.601542040522745e-07, "loss": 6.89476728439331e-05, "reward": 0.8988749980926514, "reward_std": 0.28602468967437744, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 463.375, "completions/min_length": 380.0, "epoch": 7.866176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1775696277618408, "kl": 0.014471412869170308, "learning_rate": 7.60044601841972e-07, "loss": 0.0001423656940460205, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/mean_length": 589.5625, "completions/min_length": 480.0, "epoch": 7.867647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.5664514303207397, "kl": 0.01305323583073914, "learning_rate": 7.59934982500589e-07, "loss": 0.00012826919555664062, "reward": 0.7080063819885254, "reward_std": 0.17591902613639832, "rewards/DrugCombAccuracyCOTORM/mean": 0.6555809378623962, "rewards/DrugCombAccuracyCOTORM/std": 0.3798685669898987, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8354166746139526, "rewards/DrugCombCoverageCOTORM/std": 0.3216206133365631, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 523.125, "completions/min_length": 416.0, "epoch": 7.8691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.597517728805542, "kl": 0.010757628711871803, "learning_rate": 7.598253460353474e-07, "loss": 0.00010845442011486739, "reward": 0.8333333730697632, "reward_std": 0.0942809209227562, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 488.1875, "completions/min_length": 393.0, "epoch": 7.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9512055516242981, "kl": 0.01011008070781827, "learning_rate": 7.597156924534693e-07, "loss": 0.000100640972959809, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 457.8125, "completions/min_length": 396.0, "epoch": 7.872058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9587008953094482, "kl": 0.009394191671162844, "learning_rate": 7.596060217621789e-07, "loss": 9.422805305803195e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 452.3125, "completions/min_length": 418.0, "epoch": 7.873529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.013068224303424358, "kl": 0.008843760471791029, "learning_rate": 7.594963339687008e-07, "loss": 8.92519747139886e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 466.6875, "completions/min_length": 419.0, "epoch": 7.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.9920483231544495, "kl": 0.01050185109488666, "learning_rate": 7.593866290802608e-07, "loss": 0.00010488549014553428, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 425.0, "completions/min_length": 385.0, "epoch": 7.876470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9915380477905273, "kl": 0.012131477938964963, "learning_rate": 7.59276907104086e-07, "loss": 0.0001214742660522461, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 492.75, "completions/min_length": 390.0, "epoch": 7.877941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.248245120048523, "kl": 0.013019869918935001, "learning_rate": 7.591671680474048e-07, "loss": 0.00013158097863197327, "reward": 0.375, "reward_std": 0.3246464431285858, "rewards/DrugCombAccuracyCOTORM/mean": 0.21875, "rewards/DrugCombAccuracyCOTORM/std": 0.4069705307483673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 442.75, "completions/min_length": 402.0, "epoch": 7.879411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01609622687101364, "kl": 0.009272928116843104, "learning_rate": 7.590574119174464e-07, "loss": 9.264041727874428e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 430.4375, "completions/min_length": 366.0, "epoch": 7.8808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.844336748123169, "kl": 0.010404545348137617, "learning_rate": 7.589476387214413e-07, "loss": 0.00010364882473368198, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 511.25, "completions/min_length": 445.0, "epoch": 7.882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.1805527210235596, "kl": 0.008697791141457856, "learning_rate": 7.588378484666213e-07, "loss": 8.744001388549805e-05, "reward": 0.53125, "reward_std": 0.1944543570280075, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 479.9375, "completions/min_length": 375.0, "epoch": 7.883823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.6647815108299255, "kl": 0.01816066517494619, "learning_rate": 7.587280411602186e-07, "loss": 0.00018056273984257132, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 481.9375, "completions/min_length": 431.0, "epoch": 7.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.990302562713623, "kl": 0.011237278347834945, "learning_rate": 7.586182168094675e-07, "loss": 0.00011260948667768389, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 418.0, "completions/min_length": 350.0, "epoch": 7.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.3196765184402466, "kl": 0.015616007149219513, "learning_rate": 7.585083754216026e-07, "loss": 0.00015628358232788742, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 426.6875, "completions/min_length": 388.0, "epoch": 7.8882352941176475, "frac_reward_zero_std": 1.0, "grad_norm": 0.019120987504720688, "kl": 0.011545853689312935, "learning_rate": 7.583985170038602e-07, "loss": 0.0001162120170192793, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 480.9375, "completions/min_length": 445.0, "epoch": 7.889705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8084040880203247, "kl": 0.009151766076683998, "learning_rate": 7.582886415634772e-07, "loss": 9.176135063171387e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 443.1875, "completions/min_length": 397.0, "epoch": 7.891176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014137941412627697, "kl": 0.009012621361762285, "learning_rate": 7.581787491076925e-07, "loss": 9.035668335855007e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 473.6875, "completions/min_length": 415.0, "epoch": 7.892647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7787566184997559, "kl": 0.007714049657806754, "learning_rate": 7.580688396437452e-07, "loss": 7.619708776473999e-05, "reward": 0.5551249980926514, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.4478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5045558214187622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 430.9375, "completions/min_length": 393.0, "epoch": 7.894117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.543348789215088, "kl": 0.011350814485922456, "learning_rate": 7.579589131788755e-07, "loss": 0.00011279433965682983, "reward": 0.887499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 503.875, "completions/min_length": 415.0, "epoch": 7.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9484328031539917, "kl": 0.010007149539887905, "learning_rate": 7.578489697203257e-07, "loss": 0.0001009330153465271, "reward": 0.6687500476837158, "reward_std": 0.16605544090270996, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.45338237285614014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 469.8125, "completions/min_length": 387.0, "epoch": 7.897058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1161413192749023, "kl": 0.013579455902799964, "learning_rate": 7.57739009275338e-07, "loss": 0.00013560970546677709, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 440.125, "completions/min_length": 355.0, "epoch": 7.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9260412454605103, "kl": 0.01650905259884894, "learning_rate": 7.576290318511569e-07, "loss": 0.00016613818297628313, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 476.0625, "completions/min_length": 390.0, "epoch": 7.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.905815064907074, "kl": 0.009174906997941434, "learning_rate": 7.575190374550271e-07, "loss": 9.235739707946777e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 423.5, "completions/min_length": 381.0, "epoch": 7.901470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011256244964897633, "kl": 0.008851231075823307, "learning_rate": 7.574090260941948e-07, "loss": 8.82741151144728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 453.8125, "completions/min_length": 361.0, "epoch": 7.902941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.4353344440460205, "kl": 0.013319944264367223, "learning_rate": 7.572989977759073e-07, "loss": 0.00013480708003044128, "reward": 0.7365208864212036, "reward_std": 0.17722100019454956, "rewards/DrugCombAccuracyCOTORM/mean": 0.6914843916893005, "rewards/DrugCombAccuracyCOTORM/std": 0.27914002537727356, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1983730047941208, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 474.5625, "completions/min_length": 410.0, "epoch": 7.904411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.1099364757537842, "kl": 0.015565617708489299, "learning_rate": 7.571889525074128e-07, "loss": 0.00015750527381896973, "reward": 0.4437500238418579, "reward_std": 0.3727162480354309, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 426.0, "completions/min_length": 366.0, "epoch": 7.905882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1858114004135132, "kl": 0.012497816933318973, "learning_rate": 7.570788902959611e-07, "loss": 0.00012533366680145264, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 434.625, "completions/min_length": 365.0, "epoch": 7.9073529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008657258003950119, "kl": 0.005933524458669126, "learning_rate": 7.569688111488029e-07, "loss": 5.914763460168615e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 477.8125, "completions/min_length": 387.0, "epoch": 7.908823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9506305456161499, "kl": 0.009707659250125289, "learning_rate": 7.568587150731895e-07, "loss": 9.719282388687134e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 421.0, "completions/min_length": 375.0, "epoch": 7.910294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01633133925497532, "kl": 0.007773911929689348, "learning_rate": 7.56748602076374e-07, "loss": 7.754102989565581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 432.9375, "completions/min_length": 411.0, "epoch": 7.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.762751042842865, "kl": 0.007847946020774543, "learning_rate": 7.566384721656103e-07, "loss": 7.826089859008789e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 500.875, "completions/min_length": 427.0, "epoch": 7.913235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6617830991744995, "kl": 0.011860095662996173, "learning_rate": 7.565283253481535e-07, "loss": 0.00011865794658660889, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 439.5625, "completions/min_length": 404.0, "epoch": 7.9147058823529415, "frac_reward_zero_std": 0.5, "grad_norm": 0.8860872983932495, "kl": 0.013232998549938202, "learning_rate": 7.5641816163126e-07, "loss": 0.0001324716431554407, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 435.875, "completions/min_length": 382.0, "epoch": 7.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010890306904911995, "kl": 0.00766062259208411, "learning_rate": 7.563079810221869e-07, "loss": 7.526862464146689e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 445.8125, "completions/min_length": 358.0, "epoch": 7.91764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0263265371322632, "kl": 0.00965038686990738, "learning_rate": 7.561977835281926e-07, "loss": 9.720027446746826e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 474.5, "completions/min_length": 359.0, "epoch": 7.919117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.110542893409729, "kl": 0.009047883795574307, "learning_rate": 7.560875691565365e-07, "loss": 9.046840568771586e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 432.125, "completions/min_length": 376.0, "epoch": 7.920588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.016812993213534355, "kl": 0.008001632755622268, "learning_rate": 7.559773379144796e-07, "loss": 8.011102181626484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.0625, "completions/min_length": 395.0, "epoch": 7.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015934284776449203, "kl": 0.010991309769451618, "learning_rate": 7.558670898092834e-07, "loss": 0.00010995531920343637, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 455.0, "completions/min_length": 406.0, "epoch": 7.923529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.05464727059006691, "kl": 0.01130294892936945, "learning_rate": 7.557568248482109e-07, "loss": 0.00011314186122035608, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 486.125, "completions/min_length": 401.0, "epoch": 7.925, "frac_reward_zero_std": 0.5, "grad_norm": 0.9431593418121338, "kl": 0.01121605234220624, "learning_rate": 7.556465430385259e-07, "loss": 0.00011101397831225768, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 435.8125, "completions/min_length": 393.0, "epoch": 7.926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01438708882778883, "kl": 0.009326313389465213, "learning_rate": 7.555362443874938e-07, "loss": 9.399370173923671e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 412.4375, "completions/min_length": 369.0, "epoch": 7.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.023461611941456795, "kl": 0.011183380614966154, "learning_rate": 7.554259289023805e-07, "loss": 0.00011225233174627647, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 426.375, "completions/min_length": 401.0, "epoch": 7.929411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.021929798647761345, "kl": 0.01192186656408012, "learning_rate": 7.553155965904534e-07, "loss": 0.00011958775576204062, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 460.9375, "completions/min_length": 414.0, "epoch": 7.930882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01259535364806652, "kl": 0.009467251831665635, "learning_rate": 7.55205247458981e-07, "loss": 9.396125096827745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 454.375, "completions/min_length": 380.0, "epoch": 7.932352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1112691164016724, "kl": 0.011941192671656609, "learning_rate": 7.550948815152327e-07, "loss": 0.00011935557995457202, "reward": 0.824999988079071, "reward_std": 0.24201533198356628, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 417.25, "completions/min_length": 381.0, "epoch": 7.9338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.011481168679893017, "kl": 0.00894302362576127, "learning_rate": 7.549844987664791e-07, "loss": 8.930263720685616e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 445.3125, "completions/min_length": 395.0, "epoch": 7.935294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8799375891685486, "kl": 0.008071601390838623, "learning_rate": 7.548740992199922e-07, "loss": 8.032201731111854e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 457.875, "completions/min_length": 377.0, "epoch": 7.936764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4286125898361206, "kl": 0.014995220815762877, "learning_rate": 7.547636828830444e-07, "loss": 0.00014910846948623657, "reward": 0.706250011920929, "reward_std": 0.4643521308898926, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 464.5625, "completions/min_length": 437.0, "epoch": 7.938235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018266011029481888, "kl": 0.009877594653517008, "learning_rate": 7.5465324976291e-07, "loss": 9.847064211498946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 458.3125, "completions/min_length": 373.0, "epoch": 7.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010623199865221977, "kl": 0.009101010160520673, "learning_rate": 7.54542799866864e-07, "loss": 9.12263712962158e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 497.5, "completions/min_length": 458.0, "epoch": 7.9411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9483343362808228, "kl": 0.012431044364348054, "learning_rate": 7.544323332021824e-07, "loss": 0.00012538954615592957, "reward": 0.8052083253860474, "reward_std": 0.23345819115638733, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 472.1875, "completions/min_length": 389.0, "epoch": 7.942647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1316310167312622, "kl": 0.015379783697426319, "learning_rate": 7.543218497761426e-07, "loss": 0.0001526474952697754, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 427.6875, "completions/min_length": 374.0, "epoch": 7.944117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.00976445060223341, "kl": 0.0083095250884071, "learning_rate": 7.542113495960232e-07, "loss": 8.281330519821495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 486.5625, "completions/min_length": 422.0, "epoch": 7.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.982615053653717, "kl": 0.009928840212523937, "learning_rate": 7.541008326691032e-07, "loss": 9.969143138732761e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 422.4375, "completions/min_length": 360.0, "epoch": 7.947058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01824743114411831, "kl": 0.009248450049199164, "learning_rate": 7.539902990026634e-07, "loss": 9.197550389217213e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 485.5, "completions/min_length": 433.0, "epoch": 7.948529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.5656437873840332, "kl": 0.01450475282035768, "learning_rate": 7.538797486039854e-07, "loss": 0.00014549493789672852, "reward": 0.8583333492279053, "reward_std": 0.24752496182918549, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 539.0625, "completions/min_length": 479.0, "epoch": 7.95, "frac_reward_zero_std": 0.0, "grad_norm": 1.2044012546539307, "kl": 0.010752453003078699, "learning_rate": 7.537691814803521e-07, "loss": 0.0001074671745300293, "reward": 0.4012083411216736, "reward_std": 0.4216281473636627, "rewards/DrugCombAccuracyCOTORM/mean": 0.32625001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.47225522994995117, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4020833373069763, "rewards/DrugCombCoverageCOTORM/std": 0.8529225587844849, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 564.4375, "completions/min_length": 499.0, "epoch": 7.951470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8267369270324707, "kl": 0.011162630980834365, "learning_rate": 7.536585976390472e-07, "loss": 0.00011191752855665982, "reward": 0.2862968444824219, "reward_std": 0.11766767501831055, "rewards/DrugCombAccuracyCOTORM/mean": 0.20335713028907776, "rewards/DrugCombAccuracyCOTORM/std": 0.28601470589637756, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.236111119389534, "rewards/DrugCombCoverageCOTORM/std": 0.25939151644706726, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 478.3125, "completions/min_length": 428.0, "epoch": 7.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012033089064061642, "kl": 0.008483470999635756, "learning_rate": 7.535479970873562e-07, "loss": 8.506821177434176e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 496.25, "completions/min_length": 442.0, "epoch": 7.954411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3378186225891113, "kl": 0.018372468650341034, "learning_rate": 7.534373798325645e-07, "loss": 0.00018274784088134766, "reward": 0.550000011920929, "reward_std": 0.41661903262138367, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 460.6875, "completions/min_length": 385.0, "epoch": 7.955882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.355993628501892, "kl": 0.013588553760200739, "learning_rate": 7.533267458819597e-07, "loss": 0.00013686713646166027, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 433.1875, "completions/min_length": 369.0, "epoch": 7.95735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.031959835439920425, "kl": 0.013198171043768525, "learning_rate": 7.532160952428299e-07, "loss": 0.00013282100553624332, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 488.9375, "completions/min_length": 405.0, "epoch": 7.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012757393531501293, "kl": 0.009385997196659446, "learning_rate": 7.531054279224645e-07, "loss": 9.45949723245576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 434.125, "completions/min_length": 396.0, "epoch": 7.9602941176470585, "frac_reward_zero_std": 0.5, "grad_norm": 1.1368008852005005, "kl": 0.009465704904869199, "learning_rate": 7.529947439281541e-07, "loss": 9.47713851928711e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 418.125, "completions/min_length": 356.0, "epoch": 7.961764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012171873822808266, "kl": 0.008977471967227757, "learning_rate": 7.528840432671904e-07, "loss": 8.8970446086023e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 475.625, "completions/min_length": 410.0, "epoch": 7.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8543583750724792, "kl": 0.008425551932305098, "learning_rate": 7.527733259468658e-07, "loss": 8.387380512431264e-05, "reward": 0.5625, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 426.75, "completions/min_length": 364.0, "epoch": 7.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.11073637008667, "kl": 0.012772886781021953, "learning_rate": 7.52662591974474e-07, "loss": 0.00012833625078201294, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 479.125, "completions/min_length": 427.0, "epoch": 7.966176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.285287857055664, "kl": 0.009871894028037786, "learning_rate": 7.525518413573102e-07, "loss": 9.872019290924072e-05, "reward": 0.637499988079071, "reward_std": 0.4001959264278412, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 461.375, "completions/min_length": 384.0, "epoch": 7.9676470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011914120055735111, "kl": 0.008342182962223887, "learning_rate": 7.524410741026701e-07, "loss": 8.436110510956496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 477.1875, "completions/min_length": 416.0, "epoch": 7.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8221781253814697, "kl": 0.012427214765921235, "learning_rate": 7.52330290217851e-07, "loss": 0.0001239180564880371, "reward": 0.5588333010673523, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.48500001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.4182184636592865, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3191423714160919, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 440.0625, "completions/min_length": 403.0, "epoch": 7.970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8382337093353271, "kl": 0.011825668858364224, "learning_rate": 7.52219489710151e-07, "loss": 0.00011849403381347656, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 445.875, "completions/min_length": 371.0, "epoch": 7.972058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0280665159225464, "kl": 0.011232783552259207, "learning_rate": 7.521086725868693e-07, "loss": 0.00011089444160461426, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 458.5, "completions/min_length": 423.0, "epoch": 7.973529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.496724247932434, "kl": 0.014496145769953728, "learning_rate": 7.519978388553061e-07, "loss": 0.00014498084783554077, "reward": 0.5138332843780518, "reward_std": 0.26417669653892517, "rewards/DrugCombAccuracyCOTORM/mean": 0.4391666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.456744521856308, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.4013864994049072, "step": 5422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 438.75, "completions/min_length": 358.0, "epoch": 7.975, "frac_reward_zero_std": 0.5, "grad_norm": 0.9758461117744446, "kl": 0.014101045904681087, "learning_rate": 7.518869885227631e-07, "loss": 0.0001413316058460623, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 432.625, "completions/min_length": 369.0, "epoch": 7.976470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016772301867604256, "kl": 0.00865394459106028, "learning_rate": 7.517761215965428e-07, "loss": 8.596535190008581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 473.25, "completions/min_length": 352.0, "epoch": 7.977941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9606055617332458, "kl": 0.012009783647954464, "learning_rate": 7.516652380839486e-07, "loss": 0.00011978062684647739, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 467.25, "completions/min_length": 407.0, "epoch": 7.979411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01209381502121687, "kl": 0.009339557262137532, "learning_rate": 7.515543379922857e-07, "loss": 9.28469598875381e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 434.5, "completions/min_length": 397.0, "epoch": 7.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8621386289596558, "kl": 0.009252255549654365, "learning_rate": 7.514434213288594e-07, "loss": 9.22083854675293e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.75, "completions/min_length": 388.0, "epoch": 7.982352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6358298063278198, "kl": 0.014636363601312041, "learning_rate": 7.513324881009768e-07, "loss": 0.0001479312777519226, "reward": 0.9026666879653931, "reward_std": 0.2753002643585205, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 448.625, "completions/min_length": 376.0, "epoch": 7.983823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8426166772842407, "kl": 0.010153753915801644, "learning_rate": 7.512215383159459e-07, "loss": 0.00010195250797551125, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 399.4375, "completions/min_length": 342.0, "epoch": 7.985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 13.985684394836426, "kl": 0.25291329296305776, "learning_rate": 7.511105719810759e-07, "loss": 0.002496877918019891, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 500.9375, "completions/min_length": 444.0, "epoch": 7.9867647058823525, "frac_reward_zero_std": 0.5, "grad_norm": 0.8664549589157104, "kl": 0.009992395876906812, "learning_rate": 7.509995891036769e-07, "loss": 9.991598199121654e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 570.5625, "completions/min_length": 496.0, "epoch": 7.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7049528360366821, "kl": 0.009550287621095777, "learning_rate": 7.508885896910604e-07, "loss": 9.590387344360352e-05, "reward": 0.7788888812065125, "reward_std": 0.09040765464305878, "rewards/DrugCombAccuracyCOTORM/mean": 0.737500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3110019564628601, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8888888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.2295101284980774, "step": 5432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 503.25, "completions/min_length": 453.0, "epoch": 7.989705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01834265887737274, "kl": 0.00957058509811759, "learning_rate": 7.507775737505381e-07, "loss": 9.701899398351088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 458.875, "completions/min_length": 374.0, "epoch": 7.991176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9256022572517395, "kl": 0.010027424315921962, "learning_rate": 7.506665412894242e-07, "loss": 0.0001000578558887355, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 416.75, "completions/min_length": 309.0, "epoch": 7.992647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0137445367872715, "kl": 0.008564539486542344, "learning_rate": 7.505554923150328e-07, "loss": 8.619307482149452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 392.0, "completions/min_length": 349.0, "epoch": 7.9941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011987064965069294, "kl": 0.008639918058179319, "learning_rate": 7.504444268346797e-07, "loss": 8.66716873133555e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 442.75, "completions/min_length": 369.0, "epoch": 7.995588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.026513205841183662, "kl": 0.012662208173424006, "learning_rate": 7.503333448556814e-07, "loss": 0.0001249898923560977, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 453.125, "completions/min_length": 349.0, "epoch": 7.997058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7281457185745239, "kl": 0.010548042366281152, "learning_rate": 7.50222246385356e-07, "loss": 0.00010591745376586914, "reward": 0.9928571581840515, "reward_std": 0.02020304463803768, "rewards/DrugCombAccuracyCOTORM/mean": 0.9910714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.0357142835855484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 498.3125, "completions/min_length": 388.0, "epoch": 7.998529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9281063675880432, "kl": 0.009649451705627143, "learning_rate": 7.501111314310222e-07, "loss": 9.668618440628052e-05, "reward": 0.7250000238418579, "reward_std": 0.10350983589887619, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.3966001570224762, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 433.8125, "completions/min_length": 363.0, "epoch": 8.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021698469296097755, "kl": 0.011035516858100891, "learning_rate": 7.5e-07, "loss": 0.00010991469025611877, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 407.875, "completions/min_length": 368.0, "epoch": 8.001470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010281621478497982, "kl": 0.00818850367795676, "learning_rate": 7.498888520996105e-07, "loss": 8.186690683942288e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 475.6875, "completions/min_length": 351.0, "epoch": 8.00294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7685623168945312, "kl": 0.01270845322869718, "learning_rate": 7.497776877371758e-07, "loss": 0.000128820538520813, "reward": 0.7889524102210999, "reward_std": 0.13338768482208252, "rewards/DrugCombAccuracyCOTORM/mean": 0.7440029978752136, "rewards/DrugCombAccuracyCOTORM/std": 0.3357866704463959, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 470.125, "completions/min_length": 339.0, "epoch": 8.004411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9980484843254089, "kl": 0.010908713564276695, "learning_rate": 7.496665069200191e-07, "loss": 0.00010971991287078708, "reward": 0.7166666984558105, "reward_std": 0.16618980467319489, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4629814922809601, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 453.0, "completions/min_length": 405.0, "epoch": 8.005882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0039583444595337, "kl": 0.01558435126207769, "learning_rate": 7.495553096554649e-07, "loss": 0.00015393922512885183, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 412.625, "completions/min_length": 360.0, "epoch": 8.007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02485300786793232, "kl": 0.01282926625572145, "learning_rate": 7.494440959508384e-07, "loss": 0.00012824269651900977, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 507.25, "completions/min_length": 424.0, "epoch": 8.008823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.7590844631195068, "kl": 0.008345501380972564, "learning_rate": 7.493328658134658e-07, "loss": 8.348847040906549e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 422.375, "completions/min_length": 358.0, "epoch": 8.010294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.2621383666992188, "kl": 0.01264306390658021, "learning_rate": 7.492216192506754e-07, "loss": 0.000126570463180542, "reward": 0.7705000042915344, "reward_std": 0.3333149254322052, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4172669053077698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 5447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 484.875, "completions/min_length": 412.0, "epoch": 8.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705531597137451, "kl": 0.012384939938783646, "learning_rate": 7.491103562697953e-07, "loss": 0.0001246631145477295, "reward": 0.6000000238418579, "reward_std": 0.16690459847450256, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8164966106414795, "step": 5448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 452.375, "completions/min_length": 393.0, "epoch": 8.013235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.950035572052002, "kl": 0.011096002534031868, "learning_rate": 7.489990768781552e-07, "loss": 0.00011039525270462036, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 528.125, "completions/min_length": 394.0, "epoch": 8.014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7905992865562439, "kl": 0.011617485200986266, "learning_rate": 7.488877810830863e-07, "loss": 0.00011574819654924795, "reward": 0.8883928656578064, "reward_std": 0.12097380310297012, "rewards/DrugCombAccuracyCOTORM/mean": 0.863095223903656, "rewards/DrugCombAccuracyCOTORM/std": 0.24948927760124207, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 478.6875, "completions/min_length": 413.0, "epoch": 8.016176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008886863477528095, "kl": 0.00856507197022438, "learning_rate": 7.4877646889192e-07, "loss": 8.493699715472758e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 444.0625, "completions/min_length": 385.0, "epoch": 8.01764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2082146406173706, "kl": 0.01036036352161318, "learning_rate": 7.486651403119896e-07, "loss": 0.00010402500629425049, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 495.75, "completions/min_length": 424.0, "epoch": 8.019117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9741263389587402, "kl": 0.010253310203552246, "learning_rate": 7.485537953506291e-07, "loss": 0.00010307878255844116, "reward": 0.8927083015441895, "reward_std": 0.17847666144371033, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2770128548145294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 5453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 444.0625, "completions/min_length": 403.0, "epoch": 8.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8540564775466919, "kl": 0.008778910618275404, "learning_rate": 7.484424340151735e-07, "loss": 8.740276098251343e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 480.625, "completions/min_length": 406.0, "epoch": 8.022058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9176322817802429, "kl": 0.011405491270124912, "learning_rate": 7.483310563129591e-07, "loss": 0.00011454088962636888, "reward": 0.5479166507720947, "reward_std": 0.03500283509492874, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7197608351707458, "step": 5455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 421.75, "completions/min_length": 373.0, "epoch": 8.023529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.040890224277973175, "kl": 0.014756080927327275, "learning_rate": 7.482196622513233e-07, "loss": 0.00014809750427957624, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 492.6875, "completions/min_length": 427.0, "epoch": 8.025, "frac_reward_zero_std": 0.5, "grad_norm": 0.8090134263038635, "kl": 0.00957840122282505, "learning_rate": 7.481082518376041e-07, "loss": 9.597092866897583e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 486.0625, "completions/min_length": 383.0, "epoch": 8.026470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02777986042201519, "kl": 0.011012767208740115, "learning_rate": 7.479968250791413e-07, "loss": 0.000109206055640243, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 433.5625, "completions/min_length": 373.0, "epoch": 8.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013011826202273369, "kl": 0.008617378654889762, "learning_rate": 7.478853819832751e-07, "loss": 8.666828216519207e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 479.375, "completions/min_length": 440.0, "epoch": 8.029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859761238098145, "kl": 0.01338529959321022, "learning_rate": 7.477739225573474e-07, "loss": 0.0001326538622379303, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 448.25, "completions/min_length": 369.0, "epoch": 8.030882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4328206777572632, "kl": 0.012816264759749174, "learning_rate": 7.476624468087007e-07, "loss": 0.00012962520122528076, "reward": 0.800000011920929, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 464.5, "completions/min_length": 372.0, "epoch": 8.032352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9877608418464661, "kl": 0.00904266582801938, "learning_rate": 7.475509547446788e-07, "loss": 9.086728096008301e-05, "reward": 0.9159375429153442, "reward_std": 0.15612778067588806, "rewards/DrugCombAccuracyCOTORM/mean": 0.9007812738418579, "rewards/DrugCombAccuracyCOTORM/std": 0.27153530716896057, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.1359764039516449, "step": 5462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/mean_length": 479.375, "completions/min_length": 382.0, "epoch": 8.033823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.166663646697998, "kl": 0.010877661406993866, "learning_rate": 7.474394463726264e-07, "loss": 0.00010710087371990085, "reward": 0.9062291383743286, "reward_std": 0.07892697304487228, "rewards/DrugCombAccuracyCOTORM/mean": 0.8879947662353516, "rewards/DrugCombAccuracyCOTORM/std": 0.174006387591362, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 5463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 479.8125, "completions/min_length": 410.0, "epoch": 8.035294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714286684989929, "kl": 0.0108212532941252, "learning_rate": 7.473279216998894e-07, "loss": 0.00010845810174942017, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 400.9375, "completions/min_length": 327.0, "epoch": 8.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7263962626457214, "kl": 0.009782816749066114, "learning_rate": 7.47216380733815e-07, "loss": 9.838491678237915e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 505.75, "completions/min_length": 402.0, "epoch": 8.038235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.9081218242645264, "kl": 0.011223269859328866, "learning_rate": 7.471048234817509e-07, "loss": 0.00011154943786095828, "reward": 0.8552083373069763, "reward_std": 0.09051630645990372, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 432.4375, "completions/min_length": 364.0, "epoch": 8.03970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.120299220085144, "kl": 0.00788534909952432, "learning_rate": 7.469932499510464e-07, "loss": 7.893052679719403e-05, "reward": 0.504687488079071, "reward_std": 0.013258252292871475, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 5467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 446.75, "completions/min_length": 361.0, "epoch": 8.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7564149498939514, "kl": 0.010207070503383875, "learning_rate": 7.468816601490517e-07, "loss": 0.00010335438855690882, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 465.0625, "completions/min_length": 426.0, "epoch": 8.04264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014179317280650139, "kl": 0.008433469105511904, "learning_rate": 7.467700540831181e-07, "loss": 8.433974289800972e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 462.5, "completions/min_length": 417.0, "epoch": 8.044117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.6796536445617676, "kl": 0.013294341741129756, "learning_rate": 7.466584317605978e-07, "loss": 0.00013312697410583496, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 453.6875, "completions/min_length": 373.0, "epoch": 8.045588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012796069495379925, "kl": 0.0099569505546242, "learning_rate": 7.465467931888441e-07, "loss": 9.921083255903795e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 450.3125, "completions/min_length": 411.0, "epoch": 8.047058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0463398694992065, "kl": 0.012133299140259624, "learning_rate": 7.464351383752116e-07, "loss": 0.00012080371379852295, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 480.625, "completions/min_length": 390.0, "epoch": 8.048529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0209711492061615, "kl": 0.01026223658118397, "learning_rate": 7.463234673270559e-07, "loss": 0.00010213239147560671, "reward": 0.8666666746139526, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 446.1875, "completions/min_length": 376.0, "epoch": 8.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.027564767748117447, "kl": 0.012234851717948914, "learning_rate": 7.462117800517336e-07, "loss": 0.00012291157327126712, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 466.8125, "completions/min_length": 411.0, "epoch": 8.051470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.083981990814209, "kl": 0.011790180578827858, "learning_rate": 7.461000765566021e-07, "loss": 0.00011810907744802535, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 435.125, "completions/min_length": 399.0, "epoch": 8.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.033167414367198944, "kl": 0.009995632572099566, "learning_rate": 7.459883568490207e-07, "loss": 0.00010036422463599592, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 418.8125, "completions/min_length": 384.0, "epoch": 8.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.022944550961256027, "kl": 0.008676776895299554, "learning_rate": 7.458766209363485e-07, "loss": 8.734039147384465e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 427.5, "completions/min_length": 353.0, "epoch": 8.055882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.4667965173721313, "kl": 0.00994030013680458, "learning_rate": 7.457648688259471e-07, "loss": 9.976327419281006e-05, "reward": 0.5887500047683716, "reward_std": 0.3122636675834656, "rewards/DrugCombAccuracyCOTORM/mean": 0.5458333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.4833333492279053, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5208333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.7885171175003052, "step": 5478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 486.75, "completions/min_length": 432.0, "epoch": 8.05735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3395946025848389, "kl": 0.015644749626517296, "learning_rate": 7.456531005251778e-07, "loss": 0.00015885382890701294, "reward": 0.7267500162124634, "reward_std": 0.37712085247039795, "rewards/DrugCombAccuracyCOTORM/mean": 0.6662499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.45040538907051086, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 5479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 493.5, "completions/min_length": 441.0, "epoch": 8.058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1142380237579346, "kl": 0.012032100232318044, "learning_rate": 7.455413160414041e-07, "loss": 0.00012145936489105225, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 416.5625, "completions/min_length": 363.0, "epoch": 8.060294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011579575017094612, "kl": 0.007638105540536344, "learning_rate": 7.454295153819898e-07, "loss": 7.579204975627363e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 538.1875, "completions/min_length": 476.0, "epoch": 8.061764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0107090473175049, "kl": 0.01658763736486435, "learning_rate": 7.453176985543002e-07, "loss": 0.00016715377569198608, "reward": 0.3500000238418579, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 496.6875, "completions/min_length": 451.0, "epoch": 8.063235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8387313485145569, "kl": 0.010116366320289671, "learning_rate": 7.452058655657013e-07, "loss": 0.00010083243250846863, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 451.375, "completions/min_length": 361.0, "epoch": 8.064705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01236808393150568, "kl": 0.009860473219305277, "learning_rate": 7.450940164235606e-07, "loss": 9.923236939357594e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 400.8125, "completions/min_length": 312.0, "epoch": 8.066176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014320053160190582, "kl": 0.01088854274712503, "learning_rate": 7.449821511352464e-07, "loss": 0.00010899620247073472, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 408.125, "completions/min_length": 353.0, "epoch": 8.06764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9496029019355774, "kl": 0.010300250840373337, "learning_rate": 7.44870269708128e-07, "loss": 0.00010252571519231424, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 463.4375, "completions/min_length": 399.0, "epoch": 8.069117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8819228410720825, "kl": 0.010008767247200012, "learning_rate": 7.447583721495759e-07, "loss": 9.936094284057617e-05, "reward": 0.8374999761581421, "reward_std": 0.19775526225566864, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 414.375, "completions/min_length": 344.0, "epoch": 8.070588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.2643002271652222, "kl": 0.035191822331398726, "learning_rate": 7.446464584669617e-07, "loss": 0.00033721327781677246, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 458.3125, "completions/min_length": 417.0, "epoch": 8.072058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8772486448287964, "kl": 0.013133369851857424, "learning_rate": 7.44534528667658e-07, "loss": 0.00012919679284095764, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 481.6875, "completions/min_length": 428.0, "epoch": 8.073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.2238792181015015, "kl": 0.01158501417376101, "learning_rate": 7.444225827590384e-07, "loss": 0.0001160532483481802, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 467.125, "completions/min_length": 390.0, "epoch": 8.075, "frac_reward_zero_std": 0.0, "grad_norm": 1.359549880027771, "kl": 0.011991542065516114, "learning_rate": 7.443106207484775e-07, "loss": 0.00011861324310302734, "reward": 0.6349999904632568, "reward_std": 0.36353474855422974, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.4979960024356842, "step": 5491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 444.375, "completions/min_length": 389.0, "epoch": 8.076470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015558093786239624, "kl": 0.010266531957313418, "learning_rate": 7.441986426433511e-07, "loss": 0.00010352977551519871, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 538.9375, "completions/min_length": 451.0, "epoch": 8.077941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.208889126777649, "kl": 0.009541562758386135, "learning_rate": 7.440866484510362e-07, "loss": 9.593367576599121e-05, "reward": 0.7114583253860474, "reward_std": 0.2871284484863281, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4121982753276825, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 5493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 483.1875, "completions/min_length": 421.0, "epoch": 8.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9043773412704468, "kl": 0.015103450743481517, "learning_rate": 7.439746381789107e-07, "loss": 0.00014716418809257448, "reward": 0.887499988079071, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 483.625, "completions/min_length": 418.0, "epoch": 8.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9599318504333496, "kl": 0.01421005162410438, "learning_rate": 7.438626118343533e-07, "loss": 0.00014276507135946304, "reward": 0.625249981880188, "reward_std": 0.049365732818841934, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.20069323480129242, "step": 5495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 440.5625, "completions/min_length": 377.0, "epoch": 8.08235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014535165391862392, "kl": 0.00952356681227684, "learning_rate": 7.437505694247444e-07, "loss": 9.522549225948751e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 424.75, "completions/min_length": 353.0, "epoch": 8.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 3.068523406982422, "kl": 0.019494349835440516, "learning_rate": 7.436385109574647e-07, "loss": 0.00019226087897550315, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 468.5625, "completions/min_length": 400.0, "epoch": 8.08529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9641335010528564, "kl": 0.011089749168604612, "learning_rate": 7.435264364398963e-07, "loss": 0.00011005255510099232, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 437.5625, "completions/min_length": 362.0, "epoch": 8.086764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.023277878761291504, "kl": 0.011185828829184175, "learning_rate": 7.434143458794227e-07, "loss": 0.00011271124822087586, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 442.375, "completions/min_length": 401.0, "epoch": 8.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.902860164642334, "kl": 0.010191290639340878, "learning_rate": 7.43302239283428e-07, "loss": 0.00010143965482711792, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 450.6875, "completions/min_length": 379.0, "epoch": 8.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.021708894520998, "kl": 0.010496954200789332, "learning_rate": 7.431901166592975e-07, "loss": 0.00010367513459641486, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 483.0625, "completions/min_length": 439.0, "epoch": 8.091176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9853312969207764, "kl": 0.010583623545244336, "learning_rate": 7.430779780144177e-07, "loss": 0.00010471045970916748, "reward": 0.8500000238418579, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 402.5, "completions/min_length": 374.0, "epoch": 8.092647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.037147097289562225, "kl": 0.014389727497473359, "learning_rate": 7.429658233561758e-07, "loss": 0.00014339096378535032, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 436.75, "completions/min_length": 372.0, "epoch": 8.094117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013720744289457798, "kl": 0.0076545183546841145, "learning_rate": 7.428536526919602e-07, "loss": 7.701035065110773e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 417.5, "completions/min_length": 369.0, "epoch": 8.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.027017472311854362, "kl": 0.01265023578889668, "learning_rate": 7.427414660291606e-07, "loss": 0.00012569392856676131, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 448.75, "completions/min_length": 392.0, "epoch": 8.097058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.046459831297397614, "kl": 0.01437078695744276, "learning_rate": 7.426292633751675e-07, "loss": 0.0001422580680809915, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 448.8125, "completions/min_length": 424.0, "epoch": 8.098529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.009781137108802795, "kl": 0.007802851847372949, "learning_rate": 7.425170447373725e-07, "loss": 7.786850619595498e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 458.875, "completions/min_length": 415.0, "epoch": 8.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.02238410711288452, "kl": 0.010718406876549125, "learning_rate": 7.424048101231686e-07, "loss": 0.0001070587313733995, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 479.4375, "completions/min_length": 406.0, "epoch": 8.101470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8586673140525818, "kl": 0.012611623154953122, "learning_rate": 7.42292559539949e-07, "loss": 0.00012621140922419727, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 425.4375, "completions/min_length": 366.0, "epoch": 8.102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.021036865189671516, "kl": 0.011912528774701059, "learning_rate": 7.421802929951088e-07, "loss": 0.00011983614240307361, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 447.875, "completions/min_length": 409.0, "epoch": 8.104411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01070116925984621, "kl": 0.009652915643528104, "learning_rate": 7.420680104960437e-07, "loss": 9.633832087274641e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 433.0, "completions/min_length": 383.0, "epoch": 8.105882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01801603101193905, "kl": 0.011161274276673794, "learning_rate": 7.419557120501507e-07, "loss": 0.00011167550110258162, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 472.25, "completions/min_length": 401.0, "epoch": 8.10735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.06561312079429626, "kl": 0.0166148713324219, "learning_rate": 7.418433976648277e-07, "loss": 0.00016777838754933327, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 457.75, "completions/min_length": 391.0, "epoch": 8.108823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8675500750541687, "kl": 0.013087718980386853, "learning_rate": 7.41731067347474e-07, "loss": 0.00012972102558705956, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 448.625, "completions/min_length": 399.0, "epoch": 8.110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.008357926271855831, "kl": 0.008094608667306602, "learning_rate": 7.416187211054889e-07, "loss": 8.108130714390427e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/mean_length": 563.3125, "completions/min_length": 444.0, "epoch": 8.111764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.143539547920227, "kl": 0.009719061432406306, "learning_rate": 7.415063589462741e-07, "loss": 9.834021329879761e-05, "reward": 0.716675877571106, "reward_std": 0.30043095350265503, "rewards/DrugCombAccuracyCOTORM/mean": 0.6562615036964417, "rewards/DrugCombAccuracyCOTORM/std": 0.38175004720687866, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17916128039360046, "step": 5516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 454.8125, "completions/min_length": 413.0, "epoch": 8.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014532634988427162, "kl": 0.011646683793514967, "learning_rate": 7.413939808772316e-07, "loss": 0.00011562304280232638, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 480.875, "completions/min_length": 398.0, "epoch": 8.114705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.8317146897315979, "kl": 0.009091117652133107, "learning_rate": 7.412815869057643e-07, "loss": 9.099587623495609e-05, "reward": 0.7051249742507935, "reward_std": 0.2487887442111969, "rewards/DrugCombAccuracyCOTORM/mean": 0.6978124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4644816815853119, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 5518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 477.9375, "completions/min_length": 386.0, "epoch": 8.116176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.020779728889465332, "kl": 0.010349837597459555, "learning_rate": 7.411691770392769e-07, "loss": 0.00010402090265415609, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 443.0625, "completions/min_length": 393.0, "epoch": 8.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.930804967880249, "kl": 0.011145031778141856, "learning_rate": 7.410567512851744e-07, "loss": 0.00011217634164495394, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 418.875, "completions/min_length": 393.0, "epoch": 8.119117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.029193636029958725, "kl": 0.008165409904904664, "learning_rate": 7.409443096508632e-07, "loss": 8.13533624750562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 418.0625, "completions/min_length": 365.0, "epoch": 8.120588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.014193287119269371, "kl": 0.00962309562601149, "learning_rate": 7.408318521437506e-07, "loss": 9.62292033364065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 448.1875, "completions/min_length": 377.0, "epoch": 8.12205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01663164235651493, "kl": 0.013412327505648136, "learning_rate": 7.407193787712449e-07, "loss": 0.0001359970192424953, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 454.125, "completions/min_length": 404.0, "epoch": 8.123529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01056613028049469, "kl": 0.007311663939617574, "learning_rate": 7.406068895407557e-07, "loss": 7.278336124727502e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 444.1875, "completions/min_length": 402.0, "epoch": 8.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.9306512475013733, "kl": 0.011962000746279955, "learning_rate": 7.404943844596938e-07, "loss": 0.00011958907271036878, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 515.0625, "completions/min_length": 446.0, "epoch": 8.126470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.015476319007575512, "kl": 0.008932174532674253, "learning_rate": 7.403818635354704e-07, "loss": 8.860979869496077e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 525.125, "completions/min_length": 409.0, "epoch": 8.12794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0337787866592407, "kl": 0.013904021354392171, "learning_rate": 7.40269326775498e-07, "loss": 0.0001383572816848755, "reward": 0.8348009586334229, "reward_std": 0.15673960745334625, "rewards/DrugCombAccuracyCOTORM/mean": 0.8097772598266602, "rewards/DrugCombAccuracyCOTORM/std": 0.29358023405075073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8697916865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4990442395210266, "step": 5527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 470.375, "completions/min_length": 409.0, "epoch": 8.129411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.022777313366532326, "kl": 0.01010969327762723, "learning_rate": 7.401567741871905e-07, "loss": 0.00010085197573062032, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 435.0, "completions/min_length": 397.0, "epoch": 8.130882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8905735015869141, "kl": 0.014897673157975078, "learning_rate": 7.400442057779623e-07, "loss": 0.00014697801088914275, "reward": 0.6937500238418579, "reward_std": 0.1898072361946106, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 394.6875, "completions/min_length": 308.0, "epoch": 8.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.022227531298995018, "kl": 0.011614153278060257, "learning_rate": 7.399316215552295e-07, "loss": 0.00011508013994898647, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 460.5625, "completions/min_length": 413.0, "epoch": 8.133823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.015609514899551868, "kl": 0.011787468800321221, "learning_rate": 7.398190215264086e-07, "loss": 0.00011679193994496018, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 411.8125, "completions/min_length": 345.0, "epoch": 8.135294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0080902576446533, "kl": 0.011029245099052787, "learning_rate": 7.397064056989175e-07, "loss": 0.00010956823825836182, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 451.125, "completions/min_length": 329.0, "epoch": 8.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8774693608283997, "kl": 0.011554103810340166, "learning_rate": 7.39593774080175e-07, "loss": 0.00011698901653289795, "reward": 0.6883749961853027, "reward_std": 0.19739587604999542, "rewards/DrugCombAccuracyCOTORM/mean": 0.6456249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.47505396604537964, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 5533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 468.375, "completions/min_length": 425.0, "epoch": 8.138235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016464397311210632, "kl": 0.009903290309011936, "learning_rate": 7.39481126677601e-07, "loss": 9.873555973172188e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 489.0625, "completions/min_length": 384.0, "epoch": 8.139705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.303757667541504, "kl": 0.009279330144636333, "learning_rate": 7.393684634986163e-07, "loss": 9.308010339736938e-05, "reward": 0.7790333032608032, "reward_std": 0.18495431542396545, "rewards/DrugCombAccuracyCOTORM/mean": 0.7383750081062317, "rewards/DrugCombAccuracyCOTORM/std": 0.4035550057888031, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8833333253860474, "rewards/DrugCombCoverageCOTORM/std": 0.1797116994857788, "step": 5535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 505.3125, "completions/min_length": 418.0, "epoch": 8.141176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.293083667755127, "kl": 0.010994430631399155, "learning_rate": 7.392557845506432e-07, "loss": 0.00010912120342254639, "reward": 0.3812500238418579, "reward_std": 0.4462881088256836, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 5536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 508.1875, "completions/min_length": 448.0, "epoch": 8.14264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.1983221769332886, "kl": 0.012778485077433288, "learning_rate": 7.391430898411044e-07, "loss": 0.00012683868408203125, "reward": 0.8423294425010681, "reward_std": 0.22871537506580353, "rewards/DrugCombAccuracyCOTORM/mean": 0.8029117584228516, "rewards/DrugCombAccuracyCOTORM/std": 0.4001695215702057, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 465.5, "completions/min_length": 420.0, "epoch": 8.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.022860953584313393, "kl": 0.013705601100809872, "learning_rate": 7.390303793774241e-07, "loss": 0.0001365338684991002, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 438.3125, "completions/min_length": 374.0, "epoch": 8.145588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010041574016213417, "kl": 0.008160754456184804, "learning_rate": 7.389176531670273e-07, "loss": 8.129225898301229e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 461.5625, "completions/min_length": 378.0, "epoch": 8.147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8130742907524109, "kl": 0.010086743859574199, "learning_rate": 7.3880491121734e-07, "loss": 0.0001011609347187914, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 456.25, "completions/min_length": 399.0, "epoch": 8.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8741571307182312, "kl": 0.009902487276121974, "learning_rate": 7.386921535357897e-07, "loss": 0.00010002776980400085, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 457.25, "completions/min_length": 366.0, "epoch": 8.15, "frac_reward_zero_std": 0.5, "grad_norm": 0.8004501461982727, "kl": 0.008040831773541868, "learning_rate": 7.385793801298042e-07, "loss": 8.012726902961731e-05, "reward": 0.7479166984558105, "reward_std": 0.2088208645582199, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 461.5, "completions/min_length": 401.0, "epoch": 8.151470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.2543329000473022, "kl": 0.011721538263373077, "learning_rate": 7.384665910068128e-07, "loss": 0.00011890381574630737, "reward": 0.9312499761581421, "reward_std": 0.1944543719291687, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 5543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 456.4375, "completions/min_length": 358.0, "epoch": 8.152941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4178613424301147, "kl": 0.013293363153934479, "learning_rate": 7.383537861742463e-07, "loss": 0.00012986361980438232, "reward": 0.8145833015441895, "reward_std": 0.24087008833885193, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.28867512941360474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.3095695972442627, "step": 5544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 533.125, "completions/min_length": 440.0, "epoch": 8.154411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0033643245697021, "kl": 0.01177139999344945, "learning_rate": 7.382409656395352e-07, "loss": 0.00011881733371410519, "reward": 0.6000000238418579, "reward_std": 0.16256865859031677, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 5545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 455.5, "completions/min_length": 398.0, "epoch": 8.155882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1812084913253784, "kl": 0.010373735567554832, "learning_rate": 7.381281294101122e-07, "loss": 0.0001055002212524414, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 549.25, "completions/min_length": 504.0, "epoch": 8.157352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.4345216751098633, "kl": 0.011859666323289275, "learning_rate": 7.380152774934109e-07, "loss": 0.00011858344078063965, "reward": 0.6839582920074463, "reward_std": 0.27863410115242004, "rewards/DrugCombAccuracyCOTORM/mean": 0.6309895515441895, "rewards/DrugCombAccuracyCOTORM/std": 0.41091442108154297, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.28218722343444824, "step": 5547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 458.1875, "completions/min_length": 416.0, "epoch": 8.158823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.028239013627171516, "kl": 0.01115958346053958, "learning_rate": 7.379024098968653e-07, "loss": 0.0001124547197832726, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 469.625, "completions/min_length": 430.0, "epoch": 8.160294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.6722656488418579, "kl": 0.013147312449291348, "learning_rate": 7.377895266279109e-07, "loss": 0.00012990087270736694, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 493.5, "completions/min_length": 424.0, "epoch": 8.161764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4096647500991821, "kl": 0.01257279864512384, "learning_rate": 7.376766276939845e-07, "loss": 0.0001264810562133789, "reward": 0.2657391428947449, "reward_std": 0.19456449151039124, "rewards/DrugCombAccuracyCOTORM/mean": 0.1212364137172699, "rewards/DrugCombAccuracyCOTORM/std": 0.24712517857551575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.49916598200798035, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 427.9375, "completions/min_length": 370.0, "epoch": 8.163235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.012926727533340454, "kl": 0.010450308327563107, "learning_rate": 7.375637131025232e-07, "loss": 0.00010391582327429205, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 428.125, "completions/min_length": 370.0, "epoch": 8.16470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.25408136844635, "kl": 0.010802755481563509, "learning_rate": 7.374507828609656e-07, "loss": 0.00010842084884643555, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 467.4375, "completions/min_length": 396.0, "epoch": 8.166176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01901586540043354, "kl": 0.012867445591837168, "learning_rate": 7.373378369767514e-07, "loss": 0.00012868651538155973, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 418.4375, "completions/min_length": 368.0, "epoch": 8.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0185826625674963, "kl": 0.011058511212468147, "learning_rate": 7.372248754573213e-07, "loss": 0.00010955616016872227, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 484.75, "completions/min_length": 413.0, "epoch": 8.169117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9815144538879395, "kl": 0.012029972858726978, "learning_rate": 7.371118983101165e-07, "loss": 0.00012005865573883057, "reward": 0.7124999761581421, "reward_std": 0.2386719137430191, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 5555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 434.75, "completions/min_length": 375.0, "epoch": 8.170588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.3677756786346436, "kl": 0.0103456616634503, "learning_rate": 7.369989055425801e-07, "loss": 0.00010301300790160894, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 511.25, "completions/min_length": 460.0, "epoch": 8.172058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.900244951248169, "kl": 0.00819714623503387, "learning_rate": 7.368858971621554e-07, "loss": 8.265674114227295e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 489.875, "completions/min_length": 422.0, "epoch": 8.173529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8688834309577942, "kl": 0.010613395599648356, "learning_rate": 7.367728731762874e-07, "loss": 0.00010681502317311242, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 498.625, "completions/min_length": 416.0, "epoch": 8.175, "frac_reward_zero_std": 1.0, "grad_norm": 0.01749263145029545, "kl": 0.009569214773364365, "learning_rate": 7.366598335924217e-07, "loss": 9.590831177774817e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 473.9375, "completions/min_length": 391.0, "epoch": 8.176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.0965588092803955, "kl": 0.0080638857325539, "learning_rate": 7.365467784180051e-07, "loss": 8.111447095870972e-05, "reward": 0.7928333282470703, "reward_std": 0.3835981488227844, "rewards/DrugCombAccuracyCOTORM/mean": 0.7775000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4020530879497528, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 5560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 462.25, "completions/min_length": 395.0, "epoch": 8.177941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380992665886879, "kl": 0.012783235171809793, "learning_rate": 7.364337076604852e-07, "loss": 0.00012743155821226537, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 431.625, "completions/min_length": 374.0, "epoch": 8.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8361026644706726, "kl": 0.013857344398275018, "learning_rate": 7.363206213273112e-07, "loss": 0.00013793978723697364, "reward": 0.5589166879653931, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 435.875, "completions/min_length": 379.0, "epoch": 8.180882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016775522381067276, "kl": 0.010955982375890017, "learning_rate": 7.362075194259325e-07, "loss": 0.00010964084503939375, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 452.875, "completions/min_length": 400.0, "epoch": 8.18235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9418923258781433, "kl": 0.013096715789288282, "learning_rate": 7.360944019638003e-07, "loss": 0.000129577616462484, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 448.6875, "completions/min_length": 327.0, "epoch": 8.183823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.017554091289639473, "kl": 0.011457276763394475, "learning_rate": 7.359812689483664e-07, "loss": 0.00011441632523201406, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 483.9375, "completions/min_length": 389.0, "epoch": 8.185294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.239633321762085, "kl": 0.010069240583106875, "learning_rate": 7.358681203870836e-07, "loss": 0.0001031532883644104, "reward": 0.8837500214576721, "reward_std": 0.29338207840919495, "rewards/DrugCombAccuracyCOTORM/mean": 0.8729166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.3070477843284607, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 5566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/mean_length": 579.25, "completions/min_length": 458.0, "epoch": 8.186764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.1117274761199951, "kl": 0.01042950083501637, "learning_rate": 7.35754956287406e-07, "loss": 0.00010330229997634888, "reward": 0.5945416688919067, "reward_std": 0.4053385555744171, "rewards/DrugCombAccuracyCOTORM/mean": 0.575208306312561, "rewards/DrugCombAccuracyCOTORM/std": 0.43082690238952637, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.34375, "rewards/DrugCombCoverageCOTORM/std": 0.8508574366569519, "step": 5567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 478.625, "completions/min_length": 433.0, "epoch": 8.188235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0021132230758667, "kl": 0.011107828235253692, "learning_rate": 7.356417766567886e-07, "loss": 0.00011117088433820754, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 453.0, "completions/min_length": 401.0, "epoch": 8.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.014642876572906971, "kl": 0.011555507313460112, "learning_rate": 7.355285815026872e-07, "loss": 0.00011540674313437194, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 462.25, "completions/min_length": 375.0, "epoch": 8.191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009611864574253559, "kl": 0.008108005858957767, "learning_rate": 7.354153708325588e-07, "loss": 8.103169966489077e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 417.1875, "completions/min_length": 354.0, "epoch": 8.19264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9567974209785461, "kl": 0.007900057244114578, "learning_rate": 7.353021446538616e-07, "loss": 7.875263690948486e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 533.9375, "completions/min_length": 426.0, "epoch": 8.194117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9904191493988037, "kl": 0.011288477224297822, "learning_rate": 7.351889029740547e-07, "loss": 0.0001122314715757966, "reward": 0.7164566516876221, "reward_std": 0.1210370659828186, "rewards/DrugCombAccuracyCOTORM/mean": 0.6638000011444092, "rewards/DrugCombAccuracyCOTORM/std": 0.39624133706092834, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2479172646999359, "step": 5572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 510.6875, "completions/min_length": 443.0, "epoch": 8.195588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9093258380889893, "kl": 0.010181231191381812, "learning_rate": 7.350756458005979e-07, "loss": 0.00010141730308532715, "reward": 0.5178333520889282, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833701252937317, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 5573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 496.9375, "completions/min_length": 453.0, "epoch": 8.197058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.2370328903198242, "kl": 0.010235644644126296, "learning_rate": 7.349623731409527e-07, "loss": 0.00010281801223754883, "reward": 0.5901666879653931, "reward_std": 0.34445154666900635, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9339284300804138, "step": 5574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 413.9375, "completions/min_length": 354.0, "epoch": 8.198529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.022419622167944908, "kl": 0.009836649289354682, "learning_rate": 7.348490850025807e-07, "loss": 9.883664461085573e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 419.5625, "completions/min_length": 356.0, "epoch": 8.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.030563050881028175, "kl": 0.011833950877189636, "learning_rate": 7.347357813929454e-07, "loss": 0.0001188385795103386, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 447.5, "completions/min_length": 403.0, "epoch": 8.201470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.262830376625061, "kl": 0.013590910006314516, "learning_rate": 7.346224623195109e-07, "loss": 0.00013600289821624756, "reward": 0.7749999761581421, "reward_std": 0.41661903262138367, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 516.3125, "completions/min_length": 416.0, "epoch": 8.202941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7997162938117981, "kl": 0.012176294112578034, "learning_rate": 7.345091277897423e-07, "loss": 0.00012203229562146589, "reward": 0.49166667461395264, "reward_std": 0.08626703917980194, "rewards/DrugCombAccuracyCOTORM/mean": 0.390625, "rewards/DrugCombAccuracyCOTORM/std": 0.20870135724544525, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.043033141642808914, "step": 5578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 510.1875, "completions/min_length": 382.0, "epoch": 8.204411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.658223032951355, "kl": 0.010115811368450522, "learning_rate": 7.343957778111058e-07, "loss": 0.00010166921128984541, "reward": 0.8487499952316284, "reward_std": 0.20876763761043549, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 5579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 452.0, "completions/min_length": 401.0, "epoch": 8.205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.017196403816342354, "kl": 0.010625035036355257, "learning_rate": 7.342824123910687e-07, "loss": 0.00010686744644772261, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 426.4375, "completions/min_length": 368.0, "epoch": 8.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007847162894904613, "kl": 0.006385354790836573, "learning_rate": 7.341690315370992e-07, "loss": 6.399054836947471e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 527.5625, "completions/min_length": 472.0, "epoch": 8.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01241553109139204, "kl": 0.007679559173993766, "learning_rate": 7.340556352566663e-07, "loss": 7.676896348129958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 428.5625, "completions/min_length": 374.0, "epoch": 8.21029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013967839069664478, "kl": 0.010073852259665728, "learning_rate": 7.339422235572407e-07, "loss": 0.0001008139006444253, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 496.0625, "completions/min_length": 424.0, "epoch": 8.211764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.0981498956680298, "kl": 0.01087829191237688, "learning_rate": 7.338287964462933e-07, "loss": 0.00010849535465240479, "reward": 0.6656249761581421, "reward_std": 0.38315731287002563, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.4732423722743988, "step": 5584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 428.375, "completions/min_length": 377.0, "epoch": 8.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01743011176586151, "kl": 0.010658559738658369, "learning_rate": 7.337153539312967e-07, "loss": 0.00010677229875000194, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 418.0625, "completions/min_length": 374.0, "epoch": 8.214705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013077628798782825, "kl": 0.016714795725420117, "learning_rate": 7.33601896019724e-07, "loss": 0.00016628643788862973, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 442.9375, "completions/min_length": 406.0, "epoch": 8.216176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011427225545048714, "kl": 0.00782298727426678, "learning_rate": 7.334884227190495e-07, "loss": 7.824286876711994e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 459.3125, "completions/min_length": 398.0, "epoch": 8.217647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9633538126945496, "kl": 0.010642457054927945, "learning_rate": 7.333749340367486e-07, "loss": 0.00010640585969667882, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 437.0625, "completions/min_length": 333.0, "epoch": 8.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9527754187583923, "kl": 0.012891213642433286, "learning_rate": 7.332614299802976e-07, "loss": 0.00012903660535812378, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 453.75, "completions/min_length": 402.0, "epoch": 8.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9720118641853333, "kl": 0.013991829473525286, "learning_rate": 7.33147910557174e-07, "loss": 0.00014030211605131626, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 454.3125, "completions/min_length": 386.0, "epoch": 8.222058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.022113872691988945, "kl": 0.010847209952771664, "learning_rate": 7.330343757748561e-07, "loss": 0.00010803918121382594, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 437.0, "completions/min_length": 390.0, "epoch": 8.223529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.012118564918637276, "kl": 0.009780948050320148, "learning_rate": 7.329208256408233e-07, "loss": 9.763494017533958e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 431.0, "completions/min_length": 378.0, "epoch": 8.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.009297950193285942, "kl": 0.008854278945364058, "learning_rate": 7.328072601625557e-07, "loss": 8.903382695280015e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 442.125, "completions/min_length": 408.0, "epoch": 8.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009356116876006126, "kl": 0.007440596004016697, "learning_rate": 7.326936793475351e-07, "loss": 7.463796646334231e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 483.9375, "completions/min_length": 434.0, "epoch": 8.227941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8410958051681519, "kl": 0.014322295086458325, "learning_rate": 7.325800832032438e-07, "loss": 0.00014271023974288255, "reward": 0.5375000238418579, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 448.9375, "completions/min_length": 412.0, "epoch": 8.229411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9969334602355957, "kl": 0.00985533744096756, "learning_rate": 7.324664717371652e-07, "loss": 9.816139936447144e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 440.125, "completions/min_length": 381.0, "epoch": 8.230882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.021365784108638763, "kl": 0.009456758736632764, "learning_rate": 7.323528449567838e-07, "loss": 9.42397746257484e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 438.8125, "completions/min_length": 393.0, "epoch": 8.23235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9130268692970276, "kl": 0.01186886872164905, "learning_rate": 7.32239202869585e-07, "loss": 0.00011855881166411564, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 437.5625, "completions/min_length": 382.0, "epoch": 8.233823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0660011768341064, "kl": 0.009140871814452112, "learning_rate": 7.32125545483055e-07, "loss": 9.184890950564295e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 498.625, "completions/min_length": 455.0, "epoch": 8.235294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.7428262829780579, "kl": 0.011458860244601965, "learning_rate": 7.320118728046816e-07, "loss": 0.0001146271824836731, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 453.0, "completions/min_length": 410.0, "epoch": 8.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010907020419836044, "kl": 0.009280717349611223, "learning_rate": 7.318981848419533e-07, "loss": 9.283586405217648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 456.0625, "completions/min_length": 394.0, "epoch": 8.238235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.025197399780154228, "kl": 0.011806606547906995, "learning_rate": 7.317844816023594e-07, "loss": 0.0001176372024929151, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 417.5, "completions/min_length": 375.0, "epoch": 8.239705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.006797169800847769, "kl": 0.006197368493303657, "learning_rate": 7.316707630933904e-07, "loss": 6.152110290713608e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 448.625, "completions/min_length": 395.0, "epoch": 8.241176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.031915307044983, "kl": 0.011573216528631747, "learning_rate": 7.31557029322538e-07, "loss": 0.00011678649025270715, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 487.6875, "completions/min_length": 371.0, "epoch": 8.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.027292251586914, "kl": 0.011693413835018873, "learning_rate": 7.314432802972941e-07, "loss": 0.00011796504259109497, "reward": 0.9083333015441895, "reward_std": 0.18408934772014618, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/mean_length": 579.875, "completions/min_length": 474.0, "epoch": 8.244117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.124292254447937, "kl": 0.010882086353376508, "learning_rate": 7.313295160251529e-07, "loss": 0.0001098141074180603, "reward": 0.4490327537059784, "reward_std": 0.4041435718536377, "rewards/DrugCombAccuracyCOTORM/mean": 0.3913690447807312, "rewards/DrugCombAccuracyCOTORM/std": 0.42812755703926086, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.8595864772796631, "step": 5606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 510.625, "completions/min_length": 446.0, "epoch": 8.245588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7839023470878601, "kl": 0.011744368122890592, "learning_rate": 7.312157365136086e-07, "loss": 0.00011829286813735962, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 453.0625, "completions/min_length": 421.0, "epoch": 8.24705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 402.1475830078125, "kl": 1.9812203086912632, "learning_rate": 7.311019417701566e-07, "loss": 0.021229207515716553, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 470.625, "completions/min_length": 358.0, "epoch": 8.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8674020767211914, "kl": 0.013175387168303132, "learning_rate": 7.309881318022934e-07, "loss": 0.0001315629342570901, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 5609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.875, "completions/min_length": 346.0, "epoch": 8.25, "frac_reward_zero_std": 0.5, "grad_norm": 1.3736193180084229, "kl": 0.015171082923188806, "learning_rate": 7.30874306617517e-07, "loss": 0.0001497231423854828, "reward": 0.1731666624546051, "reward_std": 0.052635613828897476, "rewards/DrugCombAccuracyCOTORM/mean": 0.054999999701976776, "rewards/DrugCombAccuracyCOTORM/std": 0.0983869880437851, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 459.125, "completions/min_length": 362.0, "epoch": 8.251470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 2.0136613845825195, "kl": 0.009409436956048012, "learning_rate": 7.307604662233253e-07, "loss": 9.395007509738207e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 513.375, "completions/min_length": 433.0, "epoch": 8.25294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.321824550628662, "kl": 0.011531151132658124, "learning_rate": 7.306466106272182e-07, "loss": 0.00011536478996276855, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 403.5, "completions/min_length": 377.0, "epoch": 8.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013541107065975666, "kl": 0.008100046368781477, "learning_rate": 7.30532739836696e-07, "loss": 8.100893319351599e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 413.8125, "completions/min_length": 374.0, "epoch": 8.255882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.011414101347327232, "kl": 0.00972112501040101, "learning_rate": 7.304188538592603e-07, "loss": 9.730901365401223e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 424.0625, "completions/min_length": 368.0, "epoch": 8.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02275177463889122, "kl": 0.010065862210467458, "learning_rate": 7.303049527024136e-07, "loss": 9.993075218517333e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 461.3125, "completions/min_length": 361.0, "epoch": 8.258823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9122031927108765, "kl": 0.009388625272549689, "learning_rate": 7.301910363736595e-07, "loss": 9.388574108015746e-05, "reward": 0.8072916865348816, "reward_std": 0.021391337737441063, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.24247947335243225, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.16770510375499725, "step": 5616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 423.8125, "completions/min_length": 362.0, "epoch": 8.260294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.017058534547686577, "kl": 0.009610721841454506, "learning_rate": 7.300771048805025e-07, "loss": 9.589446563040838e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 453.0, "completions/min_length": 394.0, "epoch": 8.261764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8668040633201599, "kl": 0.009390344377607107, "learning_rate": 7.29963158230448e-07, "loss": 9.45503925322555e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 406.3125, "completions/min_length": 368.0, "epoch": 8.263235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008445139974355698, "kl": 0.007366906385868788, "learning_rate": 7.298491964310026e-07, "loss": 7.375451968982816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 427.0625, "completions/min_length": 389.0, "epoch": 8.264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.850936770439148, "kl": 0.00957079348154366, "learning_rate": 7.297352194896737e-07, "loss": 9.545765351504087e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 442.75, "completions/min_length": 407.0, "epoch": 8.266176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016645187512040138, "kl": 0.009064404875971377, "learning_rate": 7.296212274139701e-07, "loss": 9.032270463649184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 429.375, "completions/min_length": 368.0, "epoch": 8.26764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017293039709329605, "kl": 0.01068634632974863, "learning_rate": 7.295072202114012e-07, "loss": 0.00010742578888311982, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 431.3125, "completions/min_length": 401.0, "epoch": 8.269117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.04079284518957138, "kl": 0.010943172266706824, "learning_rate": 7.293931978894774e-07, "loss": 0.00010974745237035677, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 442.875, "completions/min_length": 404.0, "epoch": 8.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.03315997123718262, "kl": 0.01095177314709872, "learning_rate": 7.292791604557101e-07, "loss": 0.0001103500762837939, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 537.9375, "completions/min_length": 476.0, "epoch": 8.272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.011027591302990913, "kl": 0.007920240866951644, "learning_rate": 7.291651079176121e-07, "loss": 7.89811383583583e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 417.6875, "completions/min_length": 376.0, "epoch": 8.273529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013115448877215385, "kl": 0.009969000006094575, "learning_rate": 7.290510402826966e-07, "loss": 9.865947504295036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 426.4375, "completions/min_length": 370.0, "epoch": 8.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.01945226825773716, "kl": 0.01029276754707098, "learning_rate": 7.289369575584783e-07, "loss": 0.0001019152914523147, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 502.8125, "completions/min_length": 404.0, "epoch": 8.276470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8784236907958984, "kl": 0.013870750088244677, "learning_rate": 7.288228597524727e-07, "loss": 0.00013997405767440796, "reward": 0.8946042060852051, "reward_std": 0.08972080051898956, "rewards/DrugCombAccuracyCOTORM/mean": 0.8702083826065063, "rewards/DrugCombAccuracyCOTORM/std": 0.20277051627635956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.042695630341768265, "step": 5628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 460.5625, "completions/min_length": 392.0, "epoch": 8.277941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8335229754447937, "kl": 0.010672940756194293, "learning_rate": 7.28708746872196e-07, "loss": 0.00010654330253601074, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 425.625, "completions/min_length": 378.0, "epoch": 8.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0310127604752779, "kl": 0.010976245161145926, "learning_rate": 7.28594618925166e-07, "loss": 0.00010950180876534432, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 482.25, "completions/min_length": 415.0, "epoch": 8.280882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3416181802749634, "kl": 0.013022006722167134, "learning_rate": 7.284804759189009e-07, "loss": 0.0001307949423789978, "reward": 0.7026666402816772, "reward_std": 0.32906854152679443, "rewards/DrugCombAccuracyCOTORM/mean": 0.6387500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4844498634338379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 476.875, "completions/min_length": 404.0, "epoch": 8.282352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9027932286262512, "kl": 0.009026294923387468, "learning_rate": 7.283663178609202e-07, "loss": 9.030477667693049e-05, "reward": 0.6143541932106018, "reward_std": 0.031053775921463966, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4515184462070465, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.37453675270080566, "step": 5632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 437.125, "completions/min_length": 378.0, "epoch": 8.283823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014318699017167091, "kl": 0.010075597325339913, "learning_rate": 7.282521447587445e-07, "loss": 0.00010060907516162843, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 451.8125, "completions/min_length": 396.0, "epoch": 8.285294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.026205068454146385, "kl": 0.012279967777431011, "learning_rate": 7.281379566198953e-07, "loss": 0.0001240887213498354, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 501.0625, "completions/min_length": 440.0, "epoch": 8.286764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2819005250930786, "kl": 0.010346760973334312, "learning_rate": 7.280237534518947e-07, "loss": 0.00010408461093902588, "reward": 0.8053333163261414, "reward_std": 0.3479629456996918, "rewards/DrugCombAccuracyCOTORM/mean": 0.7775000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4020530879497528, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 5635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 460.3125, "completions/min_length": 370.0, "epoch": 8.288235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.7599115967750549, "kl": 0.008143550832755864, "learning_rate": 7.279095352622662e-07, "loss": 8.13361257314682e-05, "reward": 0.887499988079071, "reward_std": 0.18077215552330017, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.28867512941360474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 485.0625, "completions/min_length": 414.0, "epoch": 8.28970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03973947837948799, "kl": 0.012917739106342196, "learning_rate": 7.277953020585344e-07, "loss": 0.00012971166870556772, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 432.0, "completions/min_length": 377.0, "epoch": 8.291176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.09171479195356369, "kl": 0.01000433973968029, "learning_rate": 7.276810538482244e-07, "loss": 9.989632235374302e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 470.75, "completions/min_length": 411.0, "epoch": 8.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018653064966201782, "kl": 0.010053402511402965, "learning_rate": 7.275667906388629e-07, "loss": 0.0001009129045996815, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 445.0625, "completions/min_length": 405.0, "epoch": 8.294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.019625263288617134, "kl": 0.00888153223786503, "learning_rate": 7.274525124379773e-07, "loss": 8.893088670447469e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 465.375, "completions/min_length": 372.0, "epoch": 8.295588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8219600319862366, "kl": 0.009414864936843514, "learning_rate": 7.273382192530956e-07, "loss": 9.45786086958833e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 462.4375, "completions/min_length": 381.0, "epoch": 8.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2191197872161865, "kl": 0.013026109198108315, "learning_rate": 7.272239110917473e-07, "loss": 0.00013078376650810242, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 480.375, "completions/min_length": 439.0, "epoch": 8.298529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.8405719995498657, "kl": 0.0151841442566365, "learning_rate": 7.271095879614628e-07, "loss": 0.00015103071928024292, "reward": 0.8401666879653931, "reward_std": 0.35700300335884094, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.37427040934562683, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5288001894950867, "step": 5643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 474.4375, "completions/min_length": 420.0, "epoch": 8.3, "frac_reward_zero_std": 0.0, "grad_norm": 1.6818957328796387, "kl": 0.015215616440400481, "learning_rate": 7.269952498697734e-07, "loss": 0.00015231408178806305, "reward": 0.3803333342075348, "reward_std": 0.42074722051620483, "rewards/DrugCombAccuracyCOTORM/mean": 0.3400000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4652741253376007, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0833333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.9925649762153625, "step": 5644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 435.125, "completions/min_length": 379.0, "epoch": 8.301470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9312141537666321, "kl": 0.008736886200495064, "learning_rate": 7.268808968242114e-07, "loss": 8.730177069082856e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 403.3125, "completions/min_length": 344.0, "epoch": 8.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9481414556503296, "kl": 0.011393848108127713, "learning_rate": 7.267665288323101e-07, "loss": 0.00011366605758666992, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 444.625, "completions/min_length": 360.0, "epoch": 8.304411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015545595437288284, "kl": 0.010891849175095558, "learning_rate": 7.266521459016037e-07, "loss": 0.0001082474846043624, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 445.6875, "completions/min_length": 410.0, "epoch": 8.305882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01673731952905655, "kl": 0.00969934486784041, "learning_rate": 7.265377480396277e-07, "loss": 9.793198114493862e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 476.4375, "completions/min_length": 403.0, "epoch": 8.30735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 2.3764829635620117, "kl": 0.037250258726999164, "learning_rate": 7.264233352539178e-07, "loss": 0.00037377432454377413, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 533.125, "completions/min_length": 437.0, "epoch": 8.308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7540903091430664, "kl": 0.008886349853128195, "learning_rate": 7.26308907552012e-07, "loss": 8.964780136011541e-05, "reward": 0.7536611557006836, "reward_std": 0.14032304286956787, "rewards/DrugCombAccuracyCOTORM/mean": 0.7172499895095825, "rewards/DrugCombAccuracyCOTORM/std": 0.37681251764297485, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7986111640930176, "rewards/DrugCombCoverageCOTORM/std": 0.22669117152690887, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 463.0, "completions/min_length": 426.0, "epoch": 8.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011653502471745014, "kl": 0.009627972031012177, "learning_rate": 7.261944649414478e-07, "loss": 9.606721869204193e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 440.4375, "completions/min_length": 390.0, "epoch": 8.311764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01385433692485094, "kl": 0.010521172545850277, "learning_rate": 7.260800074297647e-07, "loss": 0.00010550515435170382, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 447.3125, "completions/min_length": 400.0, "epoch": 8.313235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.069645881652832, "kl": 0.010929072741419077, "learning_rate": 7.25965535024503e-07, "loss": 0.00010750815272331238, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 477.4375, "completions/min_length": 455.0, "epoch": 8.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012923702597618103, "kl": 0.006710296613164246, "learning_rate": 7.258510477332035e-07, "loss": 6.710637535434216e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/mean_length": 552.75, "completions/min_length": 449.0, "epoch": 8.316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7568960785865784, "kl": 0.010769605287350714, "learning_rate": 7.257365455634086e-07, "loss": 0.00010481766366865486, "reward": 0.791523277759552, "reward_std": 0.13746900856494904, "rewards/DrugCombAccuracyCOTORM/mean": 0.7466957569122314, "rewards/DrugCombAccuracyCOTORM/std": 0.34304577112197876, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9416666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.1324973851442337, "step": 5655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 505.125, "completions/min_length": 400.0, "epoch": 8.31764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.04783034324646, "kl": 0.010889894561842084, "learning_rate": 7.256220285226614e-07, "loss": 0.00010963529348373413, "reward": 0.5291110873222351, "reward_std": 0.04216922074556351, "rewards/DrugCombAccuracyCOTORM/mean": 0.5068749785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.5099897980690002, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2361111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.9061972498893738, "step": 5656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 432.5, "completions/min_length": 361.0, "epoch": 8.319117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9962422251701355, "kl": 0.010041916044428945, "learning_rate": 7.25507496618506e-07, "loss": 0.00010035187005996704, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 414.0625, "completions/min_length": 387.0, "epoch": 8.320588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01254450436681509, "kl": 0.010617933701723814, "learning_rate": 7.253929498584873e-07, "loss": 0.00010621862020343542, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 501.5625, "completions/min_length": 384.0, "epoch": 8.322058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012210703454911709, "kl": 0.008303064736537635, "learning_rate": 7.252783882501514e-07, "loss": 8.263711060862988e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 434.0625, "completions/min_length": 391.0, "epoch": 8.323529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.4596906900405884, "kl": 0.015226417919620872, "learning_rate": 7.251638118010456e-07, "loss": 0.0001509338617324829, "reward": 0.59375, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 503.875, "completions/min_length": 435.0, "epoch": 8.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.015170961618423462, "kl": 0.011893266811966896, "learning_rate": 7.250492205187176e-07, "loss": 0.0001186955050798133, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 429.0, "completions/min_length": 351.0, "epoch": 8.326470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9255498051643372, "kl": 0.0094840950332582, "learning_rate": 7.249346144107164e-07, "loss": 9.528547525405884e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 421.375, "completions/min_length": 362.0, "epoch": 8.327941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.018669314682483673, "kl": 0.010842392686754465, "learning_rate": 7.24819993484592e-07, "loss": 0.00010933383600786328, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 460.8125, "completions/min_length": 410.0, "epoch": 8.329411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.102624773979187, "kl": 0.011610919144004583, "learning_rate": 7.247053577478954e-07, "loss": 0.00011561355495359749, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 388.9375, "completions/min_length": 329.0, "epoch": 8.330882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.4797027111053467, "kl": 0.01084361900575459, "learning_rate": 7.245907072081787e-07, "loss": 0.00010786805069074035, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 493.125, "completions/min_length": 426.0, "epoch": 8.33235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0915839672088623, "kl": 0.01472764927893877, "learning_rate": 7.244760418729944e-07, "loss": 0.0001454491721233353, "reward": 0.6828303337097168, "reward_std": 0.1618662029504776, "rewards/DrugCombAccuracyCOTORM/mean": 0.6080952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.49096694588661194, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9635416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 5666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 410.75, "completions/min_length": 380.0, "epoch": 8.333823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010175080969929695, "kl": 0.009381098905578256, "learning_rate": 7.243613617498965e-07, "loss": 9.378642425872386e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 488.8125, "completions/min_length": 436.0, "epoch": 8.33529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7820096611976624, "kl": 0.01019534410443157, "learning_rate": 7.242466668464401e-07, "loss": 0.0001015649177134037, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 476.875, "completions/min_length": 415.0, "epoch": 8.336764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.009629384614527225, "kl": 0.008115090080536902, "learning_rate": 7.241319571701806e-07, "loss": 8.126227476168424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 442.4375, "completions/min_length": 374.0, "epoch": 8.338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1635748147964478, "kl": 0.010820459690876305, "learning_rate": 7.24017232728675e-07, "loss": 0.0001068328128894791, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 431.25, "completions/min_length": 385.0, "epoch": 8.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1267218589782715, "kl": 0.011871496913954616, "learning_rate": 7.23902493529481e-07, "loss": 0.00011927106970688328, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 505.875, "completions/min_length": 445.0, "epoch": 8.341176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.2590301036834717, "kl": 0.011439583031460643, "learning_rate": 7.237877395801575e-07, "loss": 0.0001140260137617588, "reward": 0.9051250219345093, "reward_std": 0.17601576447486877, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 459.5, "completions/min_length": 368.0, "epoch": 8.342647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.29982590675354, "kl": 0.013312492985278368, "learning_rate": 7.236729708882638e-07, "loss": 0.00013162195682525635, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 475.4375, "completions/min_length": 405.0, "epoch": 8.344117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.020509712398052216, "kl": 0.011509850155562162, "learning_rate": 7.235581874613612e-07, "loss": 0.0001150804091594182, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 427.375, "completions/min_length": 398.0, "epoch": 8.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008347193710505962, "kl": 0.007766445982269943, "learning_rate": 7.234433893070107e-07, "loss": 7.753611134830862e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 489.5, "completions/min_length": 393.0, "epoch": 8.347058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7789658308029175, "kl": 0.011178823886439204, "learning_rate": 7.233285764327751e-07, "loss": 0.00011276811710558832, "reward": 0.668218731880188, "reward_std": 0.007357499562203884, "rewards/DrugCombAccuracyCOTORM/mean": 0.6071763634681702, "rewards/DrugCombAccuracyCOTORM/std": 0.40580886602401733, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8247767686843872, "rewards/DrugCombCoverageCOTORM/std": 0.18306474387645721, "step": 5676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 429.8125, "completions/min_length": 378.0, "epoch": 8.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0734467506408691, "kl": 0.009925266145728528, "learning_rate": 7.232137488462182e-07, "loss": 9.957427391782403e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 450.6875, "completions/min_length": 390.0, "epoch": 8.35, "frac_reward_zero_std": 0.5, "grad_norm": 0.8824604749679565, "kl": 0.012223766301758587, "learning_rate": 7.230989065549044e-07, "loss": 0.00012207901454530656, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 438.6875, "completions/min_length": 374.0, "epoch": 8.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.018192362040281296, "kl": 0.008426078711636364, "learning_rate": 7.229840495663991e-07, "loss": 8.463850826956332e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 432.6875, "completions/min_length": 375.0, "epoch": 8.352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01869773678481579, "kl": 0.008417400647886097, "learning_rate": 7.228691778882692e-07, "loss": 8.491834159940481e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 415.125, "completions/min_length": 364.0, "epoch": 8.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.026036059483885765, "kl": 0.010936686070635915, "learning_rate": 7.227542915280816e-07, "loss": 0.00010994699550792575, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 484.625, "completions/min_length": 428.0, "epoch": 8.355882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.021150147542357445, "kl": 0.010578445042483509, "learning_rate": 7.226393904934051e-07, "loss": 0.00010550465231062844, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 440.25, "completions/min_length": 388.0, "epoch": 8.35735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9298743605613708, "kl": 0.009617002098821104, "learning_rate": 7.225244747918089e-07, "loss": 9.675323963165283e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 472.3125, "completions/min_length": 414.0, "epoch": 8.358823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.014408239163458347, "kl": 0.009734432213008404, "learning_rate": 7.224095444308636e-07, "loss": 9.747120930114761e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 430.9375, "completions/min_length": 391.0, "epoch": 8.360294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0286190509796143, "kl": 0.011493354104459286, "learning_rate": 7.222945994181402e-07, "loss": 0.0001145973801612854, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 505.25, "completions/min_length": 444.0, "epoch": 8.361764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.28377103805542, "kl": 0.013001821236684918, "learning_rate": 7.221796397612113e-07, "loss": 0.00013006478548049927, "reward": 0.6375000476837158, "reward_std": 0.4051477909088135, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 437.1875, "completions/min_length": 394.0, "epoch": 8.363235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014227453619241714, "kl": 0.00996724097058177, "learning_rate": 7.220646654676498e-07, "loss": 9.986834629671648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 433.4375, "completions/min_length": 369.0, "epoch": 8.364705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.8843666315078735, "kl": 0.009460341068916023, "learning_rate": 7.219496765450304e-07, "loss": 9.562302875565365e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 496.5, "completions/min_length": 427.0, "epoch": 8.366176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 1.5175480842590332, "kl": 0.013319001765921712, "learning_rate": 7.218346730009278e-07, "loss": 0.00013436377048492432, "reward": 0.18183332681655884, "reward_std": 0.21442942321300507, "rewards/DrugCombAccuracyCOTORM/mean": 0.07625000178813934, "rewards/DrugCombAccuracyCOTORM/std": 0.2523720860481262, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2083333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.6070572733879089, "step": 5689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.5625, "completions/min_length": 382.0, "epoch": 8.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.019370855763554573, "kl": 0.01108604553155601, "learning_rate": 7.217196548429183e-07, "loss": 0.0001102807727875188, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 489.125, "completions/min_length": 439.0, "epoch": 8.369117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0802417993545532, "kl": 0.012771458365023136, "learning_rate": 7.216046220785792e-07, "loss": 0.00012872068327851593, "reward": 0.9291666746139526, "reward_std": 0.07572401314973831, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 5691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 533.125, "completions/min_length": 429.0, "epoch": 8.370588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3082568645477295, "kl": 0.010684463893994689, "learning_rate": 7.214895747154883e-07, "loss": 0.00010786950588226318, "reward": 0.7108333110809326, "reward_std": 0.32050198316574097, "rewards/DrugCombAccuracyCOTORM/mean": 0.663095235824585, "rewards/DrugCombAccuracyCOTORM/std": 0.4153003692626953, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8035714626312256, "rewards/DrugCombCoverageCOTORM/std": 0.29450756311416626, "step": 5692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 515.5, "completions/min_length": 406.0, "epoch": 8.37205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0870846509933472, "kl": 0.011817733058705926, "learning_rate": 7.213745127612248e-07, "loss": 0.00011829247523564845, "reward": 0.8310055732727051, "reward_std": 0.15555411577224731, "rewards/DrugCombAccuracyCOTORM/mean": 0.792229175567627, "rewards/DrugCombAccuracyCOTORM/std": 0.34045252203941345, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9722222089767456, "rewards/DrugCombCoverageCOTORM/std": 0.060858070850372314, "step": 5693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 490.4375, "completions/min_length": 422.0, "epoch": 8.373529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.4167733192443848, "kl": 0.010623016278259456, "learning_rate": 7.212594362233686e-07, "loss": 0.00010547786951065063, "reward": 0.5083333849906921, "reward_std": 0.4306259751319885, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 446.4375, "completions/min_length": 409.0, "epoch": 8.375, "frac_reward_zero_std": 0.5, "grad_norm": 1.3428620100021362, "kl": 0.00972790562082082, "learning_rate": 7.211443451095006e-07, "loss": 9.655952453613281e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 482.875, "completions/min_length": 407.0, "epoch": 8.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9276728630065918, "kl": 0.008410421898588538, "learning_rate": 7.210292394272028e-07, "loss": 8.426618296653032e-05, "reward": 0.909250020980835, "reward_std": 0.1701066941022873, "rewards/DrugCombAccuracyCOTORM/mean": 0.8904687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.30268827080726624, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 5696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 473.125, "completions/min_length": 424.0, "epoch": 8.37794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8469406366348267, "kl": 0.009722839808091521, "learning_rate": 7.209141191840582e-07, "loss": 9.687617421150208e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 446.1875, "completions/min_length": 393.0, "epoch": 8.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010815559886395931, "kl": 0.009116058237850666, "learning_rate": 7.207989843876504e-07, "loss": 9.106068318942562e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 447.6875, "completions/min_length": 385.0, "epoch": 8.380882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.009811479598283768, "kl": 0.00845218903850764, "learning_rate": 7.206838350455643e-07, "loss": 8.469910244457424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 485.9375, "completions/min_length": 416.0, "epoch": 8.382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1554492712020874, "kl": 0.014132744865491986, "learning_rate": 7.205686711653855e-07, "loss": 0.00014122230641078204, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 426.375, "completions/min_length": 391.0, "epoch": 8.383823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.012539057992398739, "kl": 0.009060418116860092, "learning_rate": 7.20453492754701e-07, "loss": 9.058111754711717e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 454.9375, "completions/min_length": 362.0, "epoch": 8.385294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0742099285125732, "kl": 0.009803376044146717, "learning_rate": 7.203382998210983e-07, "loss": 9.923068864736706e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 420.1875, "completions/min_length": 359.0, "epoch": 8.386764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009748287498950958, "kl": 0.006763318087905645, "learning_rate": 7.20223092372166e-07, "loss": 6.771340849809349e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 476.5625, "completions/min_length": 404.0, "epoch": 8.388235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.002551555633545, "kl": 0.010239178547635674, "learning_rate": 7.201078704154937e-07, "loss": 0.00010260194540023804, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 477.5, "completions/min_length": 398.0, "epoch": 8.389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.023195572197437286, "kl": 0.00937990634702146, "learning_rate": 7.199926339586718e-07, "loss": 9.356014197692275e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 458.5, "completions/min_length": 391.0, "epoch": 8.391176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012247521430253983, "kl": 0.009045132203027606, "learning_rate": 7.19877383009292e-07, "loss": 9.05894412426278e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 430.5625, "completions/min_length": 406.0, "epoch": 8.39264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012008070014417171, "kl": 0.008869029930792749, "learning_rate": 7.197621175749467e-07, "loss": 8.852174505591393e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 460.5625, "completions/min_length": 422.0, "epoch": 8.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013543530367314816, "kl": 0.009632978239096701, "learning_rate": 7.19646837663229e-07, "loss": 9.58335876930505e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 388.8125, "completions/min_length": 309.0, "epoch": 8.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.100691318511963, "kl": 0.010511782253161073, "learning_rate": 7.195315432817337e-07, "loss": 0.00010532140731811523, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 535.9375, "completions/min_length": 462.0, "epoch": 8.397058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.108932614326477, "kl": 0.00992441875860095, "learning_rate": 7.19416234438056e-07, "loss": 9.941741882357746e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 452.9375, "completions/min_length": 378.0, "epoch": 8.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9897128343582153, "kl": 0.012792409164831042, "learning_rate": 7.193009111397918e-07, "loss": 0.00012803822755813599, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 427.8125, "completions/min_length": 345.0, "epoch": 8.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.015626098960638046, "kl": 0.011831334326416254, "learning_rate": 7.191855733945386e-07, "loss": 0.00011688774975482374, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 404.5625, "completions/min_length": 347.0, "epoch": 8.401470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.014705010689795017, "kl": 0.008594221202656627, "learning_rate": 7.190702212098947e-07, "loss": 8.588933997089043e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 459.875, "completions/min_length": 400.0, "epoch": 8.402941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012494685128331184, "kl": 0.008677989128045738, "learning_rate": 7.189548545934589e-07, "loss": 8.670661190990359e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 433.4375, "completions/min_length": 387.0, "epoch": 8.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.021574776619672775, "kl": 0.010302209295332432, "learning_rate": 7.188394735528312e-07, "loss": 0.00010287045006407425, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 540.125, "completions/min_length": 479.0, "epoch": 8.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9437121152877808, "kl": 0.008708130568265915, "learning_rate": 7.187240780956132e-07, "loss": 8.642649481771514e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 408.625, "completions/min_length": 362.0, "epoch": 8.407352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3744487762451172, "kl": 0.011914256028831005, "learning_rate": 7.186086682294062e-07, "loss": 0.0001182384294224903, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 427.6875, "completions/min_length": 338.0, "epoch": 8.408823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.012790213339030743, "kl": 0.008986509521491826, "learning_rate": 7.184932439618135e-07, "loss": 9.01709936442785e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 483.875, "completions/min_length": 426.0, "epoch": 8.410294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.6999908685684204, "kl": 0.02345931949093938, "learning_rate": 7.183778053004386e-07, "loss": 0.00022692001948598772, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 402.4375, "completions/min_length": 356.0, "epoch": 8.411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01168015506118536, "kl": 0.009032190078869462, "learning_rate": 7.182623522528865e-07, "loss": 8.887544390745461e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 434.25, "completions/min_length": 379.0, "epoch": 8.413235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.021244972944259644, "kl": 0.0106681645847857, "learning_rate": 7.181468848267632e-07, "loss": 0.00010570940503384918, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 460.125, "completions/min_length": 384.0, "epoch": 8.41470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016493767499923706, "kl": 0.012090768432244658, "learning_rate": 7.180314030296752e-07, "loss": 0.0001200975093524903, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 520.625, "completions/min_length": 442.0, "epoch": 8.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2770237922668457, "kl": 0.01535532483831048, "learning_rate": 7.179159068692302e-07, "loss": 0.0001522467064205557, "reward": 0.6886261105537415, "reward_std": 0.1649780124425888, "rewards/DrugCombAccuracyCOTORM/mean": 0.6357825994491577, "rewards/DrugCombAccuracyCOTORM/std": 0.4578479528427124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.800000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.27325204014778137, "step": 5723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 421.125, "completions/min_length": 351.0, "epoch": 8.41764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8995545506477356, "kl": 0.010767016327008605, "learning_rate": 7.178003963530365e-07, "loss": 0.00010657874372554943, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 439.125, "completions/min_length": 384.0, "epoch": 8.419117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03140941262245178, "kl": 0.010191305191256106, "learning_rate": 7.17684871488704e-07, "loss": 0.00010165365529246628, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 417.125, "completions/min_length": 340.0, "epoch": 8.420588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0425868034362793, "kl": 0.011515631573274732, "learning_rate": 7.17569332283843e-07, "loss": 0.00011555185483302921, "reward": 0.9619500041007996, "reward_std": 0.10762164741754532, "rewards/DrugCombAccuracyCOTORM/mean": 0.9539999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.18400000035762787, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 5726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 420.125, "completions/min_length": 397.0, "epoch": 8.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8831822872161865, "kl": 0.01350751519203186, "learning_rate": 7.174537787460651e-07, "loss": 0.00013437867164611816, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 456.75, "completions/min_length": 397.0, "epoch": 8.423529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.016938094049692154, "kl": 0.0073332153260707855, "learning_rate": 7.173382108829825e-07, "loss": 7.320211443584412e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/mean_length": 511.875, "completions/min_length": 390.0, "epoch": 8.425, "frac_reward_zero_std": 0.5, "grad_norm": 0.8813403844833374, "kl": 0.007890596985816956, "learning_rate": 7.172226287022085e-07, "loss": 8.062810229603201e-05, "reward": 0.8475000262260437, "reward_std": 0.21049262583255768, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 5729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 414.375, "completions/min_length": 329.0, "epoch": 8.426470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.012008745223283768, "kl": 0.009479578817263246, "learning_rate": 7.171070322113575e-07, "loss": 9.328401938546449e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 502.5625, "completions/min_length": 446.0, "epoch": 8.427941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.026862364262342453, "kl": 0.009978122543543577, "learning_rate": 7.169914214180447e-07, "loss": 0.00010025162191595882, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 434.625, "completions/min_length": 393.0, "epoch": 8.429411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0801973342895508, "kl": 0.012667203322052956, "learning_rate": 7.168757963298861e-07, "loss": 0.00012531131505966187, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 466.0625, "completions/min_length": 430.0, "epoch": 8.430882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.052606601268053055, "kl": 0.013909546774812043, "learning_rate": 7.167601569544989e-07, "loss": 0.00013889250112697482, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 456.625, "completions/min_length": 392.0, "epoch": 8.43235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0039715766906738, "kl": 0.008834082982502878, "learning_rate": 7.166445032995012e-07, "loss": 8.823292591841891e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 430.375, "completions/min_length": 352.0, "epoch": 8.433823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7815461754798889, "kl": 0.009593777707777917, "learning_rate": 7.165288353725119e-07, "loss": 9.50083413044922e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 495.5625, "completions/min_length": 427.0, "epoch": 8.435294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4324016571044922, "kl": 0.013043803861364722, "learning_rate": 7.164131531811508e-07, "loss": 0.00013090670108795166, "reward": 0.7124999761581421, "reward_std": 0.2505747675895691, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 506.9375, "completions/min_length": 444.0, "epoch": 8.436764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9735100269317627, "kl": 0.009723689057864249, "learning_rate": 7.162974567330388e-07, "loss": 9.745359420776367e-05, "reward": 0.6371666789054871, "reward_std": 0.16668714582920074, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4699627757072449, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7781745791435242, "step": 5737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 451.875, "completions/min_length": 381.0, "epoch": 8.438235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1362299919128418, "kl": 0.011705259094014764, "learning_rate": 7.161817460357978e-07, "loss": 0.00011695176362991333, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 464.8125, "completions/min_length": 372.0, "epoch": 8.439705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012659295462071896, "kl": 0.008133704075589776, "learning_rate": 7.160660210970504e-07, "loss": 8.152039663400501e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 421.9375, "completions/min_length": 346.0, "epoch": 8.441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9045212268829346, "kl": 0.00982010760344565, "learning_rate": 7.159502819244205e-07, "loss": 9.684140968602151e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 496.375, "completions/min_length": 396.0, "epoch": 8.44264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7760012745857239, "kl": 0.010758640244603157, "learning_rate": 7.158345285255324e-07, "loss": 0.0001084059476852417, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 5741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 442.625, "completions/min_length": 369.0, "epoch": 8.444117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.2409648895263672, "kl": 0.015097017982043326, "learning_rate": 7.157187609080118e-07, "loss": 0.00014898180961608887, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 507.375, "completions/min_length": 407.0, "epoch": 8.445588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.869549036026001, "kl": 0.012022570008412004, "learning_rate": 7.156029790794851e-07, "loss": 0.00012217462062835693, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 451.1875, "completions/min_length": 409.0, "epoch": 8.447058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02326263301074505, "kl": 0.011165020288899541, "learning_rate": 7.154871830475797e-07, "loss": 0.00011148870544275269, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 506.5, "completions/min_length": 446.0, "epoch": 8.448529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9098938703536987, "kl": 0.01329533546231687, "learning_rate": 7.153713728199241e-07, "loss": 0.0001338766742264852, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 462.5, "completions/min_length": 403.0, "epoch": 8.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.05595838278532028, "kl": 0.011122491443529725, "learning_rate": 7.152555484041475e-07, "loss": 0.00011242514301557094, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 464.0, "completions/min_length": 434.0, "epoch": 8.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3404080867767334, "kl": 0.007759460946545005, "learning_rate": 7.151397098078802e-07, "loss": 7.765740156173706e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 458.3125, "completions/min_length": 364.0, "epoch": 8.452941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8614286780357361, "kl": 0.011625546729192138, "learning_rate": 7.150238570387531e-07, "loss": 0.00011499226093292236, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 515.6875, "completions/min_length": 452.0, "epoch": 8.454411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.3006070852279663, "kl": 0.011642619036138058, "learning_rate": 7.149079901043986e-07, "loss": 0.0001168176531791687, "reward": 0.5552083253860474, "reward_std": 0.4404778778553009, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 5749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 388.0, "completions/min_length": 355.0, "epoch": 8.455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.008589260280132294, "kl": 0.0073155309073626995, "learning_rate": 7.147921090124495e-07, "loss": 7.335354166571051e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 498.5625, "completions/min_length": 404.0, "epoch": 8.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.086269736289978, "kl": 0.010155548341572285, "learning_rate": 7.146762137705397e-07, "loss": 0.00010140985250473022, "reward": 0.6499791741371155, "reward_std": 0.059433210641145706, "rewards/DrugCombAccuracyCOTORM/mean": 0.5839583277702332, "rewards/DrugCombAccuracyCOTORM/std": 0.4407220184803009, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.193574458360672, "step": 5751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 438.0625, "completions/min_length": 376.0, "epoch": 8.458823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.922283947467804, "kl": 0.0090870491694659, "learning_rate": 7.145603043863044e-07, "loss": 9.104407945415005e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 468.75, "completions/min_length": 438.0, "epoch": 8.46029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010752161033451557, "kl": 0.007500902866013348, "learning_rate": 7.144443808673793e-07, "loss": 7.455125160049647e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 462.8125, "completions/min_length": 405.0, "epoch": 8.461764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.02725124917924404, "kl": 0.012947614537551999, "learning_rate": 7.14328443221401e-07, "loss": 0.00012990286631975323, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 421.8125, "completions/min_length": 375.0, "epoch": 8.463235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0363000631332397, "kl": 0.010033076861873269, "learning_rate": 7.142124914560071e-07, "loss": 0.00010044872760772705, "reward": 0.5589166879653931, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 454.75, "completions/min_length": 402.0, "epoch": 8.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8915908932685852, "kl": 0.010922883404418826, "learning_rate": 7.140965255788364e-07, "loss": 0.00010939584171865135, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 496.375, "completions/min_length": 428.0, "epoch": 8.466176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.02286958694458, "kl": 0.012752153677865863, "learning_rate": 7.139805455975285e-07, "loss": 0.00012910921941511333, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 463.9375, "completions/min_length": 409.0, "epoch": 8.467647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.4874285459518433, "kl": 0.01313444273546338, "learning_rate": 7.13864551519724e-07, "loss": 0.00013143569231033325, "reward": 0.6000000238418579, "reward_std": 0.4742809236049652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 5758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 426.625, "completions/min_length": 376.0, "epoch": 8.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.031103629618883133, "kl": 0.011023770086467266, "learning_rate": 7.137485433530638e-07, "loss": 0.00011022544640582055, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 472.125, "completions/min_length": 421.0, "epoch": 8.470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02536272443830967, "kl": 0.010578776709735394, "learning_rate": 7.136325211051904e-07, "loss": 0.00010640429536579177, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 484.5625, "completions/min_length": 428.0, "epoch": 8.472058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9196327924728394, "kl": 0.014358701882883906, "learning_rate": 7.135164847837474e-07, "loss": 0.0001463327935198322, "reward": 0.6485832929611206, "reward_std": 0.0324070081114769, "rewards/DrugCombAccuracyCOTORM/mean": 0.5893750190734863, "rewards/DrugCombAccuracyCOTORM/std": 0.4259871244430542, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.266109824180603, "step": 5761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 451.0, "completions/min_length": 363.0, "epoch": 8.473529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.02152286469936371, "kl": 0.01096751308068633, "learning_rate": 7.134004343963785e-07, "loss": 0.00011073108180426061, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 461.1875, "completions/min_length": 416.0, "epoch": 8.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.8291409611701965, "kl": 0.009226203663274646, "learning_rate": 7.132843699507291e-07, "loss": 9.237779886461794e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 466.625, "completions/min_length": 401.0, "epoch": 8.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014084131456911564, "kl": 0.011376392561942339, "learning_rate": 7.131682914544452e-07, "loss": 0.00011415145127102733, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/mean_length": 574.0, "completions/min_length": 454.0, "epoch": 8.477941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8324857950210571, "kl": 0.01110875466838479, "learning_rate": 7.130521989151738e-07, "loss": 0.0001105223927879706, "reward": 0.5694444179534912, "reward_std": 0.0240677148103714, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.725694477558136, "rewards/DrugCombCoverageCOTORM/std": 0.3899291753768921, "step": 5765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 478.5625, "completions/min_length": 377.0, "epoch": 8.479411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.817786455154419, "kl": 0.011020082049071789, "learning_rate": 7.129360923405627e-07, "loss": 0.00011095353693235666, "reward": 0.6071428656578064, "reward_std": 0.14346809685230255, "rewards/DrugCombAccuracyCOTORM/mean": 0.5714285969734192, "rewards/DrugCombAccuracyCOTORM/std": 0.47809144854545593, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 5766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 409.8125, "completions/min_length": 354.0, "epoch": 8.480882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.065115213394165, "kl": 0.01329636131413281, "learning_rate": 7.128199717382606e-07, "loss": 0.00013162195682525635, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 483.625, "completions/min_length": 387.0, "epoch": 8.48235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.026173798367381096, "kl": 0.012241940014064312, "learning_rate": 7.127038371159173e-07, "loss": 0.00012354757927823812, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 459.0625, "completions/min_length": 409.0, "epoch": 8.483823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019347352907061577, "kl": 0.007076804409734905, "learning_rate": 7.125876884811835e-07, "loss": 7.051703869365156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 465.6875, "completions/min_length": 413.0, "epoch": 8.485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0147055983543396, "kl": 0.009571653790771961, "learning_rate": 7.12471525841711e-07, "loss": 9.58563614403829e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 472.375, "completions/min_length": 426.0, "epoch": 8.486764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7928053736686707, "kl": 0.011333212023600936, "learning_rate": 7.123553492051519e-07, "loss": 0.00011245091445744038, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 430.625, "completions/min_length": 373.0, "epoch": 8.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016788626089692116, "kl": 0.009013410191982985, "learning_rate": 7.122391585791597e-07, "loss": 9.048281208379194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 483.375, "completions/min_length": 401.0, "epoch": 8.489705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013020589016377926, "kl": 0.010212840745225549, "learning_rate": 7.12122953971389e-07, "loss": 0.00010233103239443153, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 418.75, "completions/min_length": 372.0, "epoch": 8.491176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.014183788560330868, "kl": 0.010748877190053463, "learning_rate": 7.120067353894949e-07, "loss": 0.00010761710291262716, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 448.125, "completions/min_length": 382.0, "epoch": 8.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.2156532108783722, "kl": 0.01668748678639531, "learning_rate": 7.118905028411335e-07, "loss": 0.000167471167515032, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 510.9375, "completions/min_length": 433.0, "epoch": 8.494117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9527208805084229, "kl": 0.010410310584120452, "learning_rate": 7.117742563339621e-07, "loss": 0.00010397285223007202, "reward": 0.3375000059604645, "reward_std": 0.22951814532279968, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 5776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 458.4375, "completions/min_length": 406.0, "epoch": 8.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.03378468379378319, "kl": 0.0107945348136127, "learning_rate": 7.116579958756387e-07, "loss": 0.00010845320502994582, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 422.1875, "completions/min_length": 339.0, "epoch": 8.49705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010637101717293262, "kl": 0.008589675067923963, "learning_rate": 7.11541721473822e-07, "loss": 8.606545452494174e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 440.4375, "completions/min_length": 385.0, "epoch": 8.498529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.5713183879852295, "kl": 0.01430847030133009, "learning_rate": 7.114254331361721e-07, "loss": 0.00014357268810272217, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 431.9375, "completions/min_length": 339.0, "epoch": 8.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04954492673277855, "kl": 0.00961208320222795, "learning_rate": 7.113091308703497e-07, "loss": 9.707942081149668e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 438.8125, "completions/min_length": 396.0, "epoch": 8.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9197602868080139, "kl": 0.011127534788101912, "learning_rate": 7.111928146840165e-07, "loss": 0.00011104674922535196, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 482.0625, "completions/min_length": 378.0, "epoch": 8.50294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.2521759271621704, "kl": 0.010531695908866823, "learning_rate": 7.110764845848353e-07, "loss": 0.00010491162538528442, "reward": 0.9089285731315613, "reward_std": 0.20732080936431885, "rewards/DrugCombAccuracyCOTORM/mean": 0.9017857313156128, "rewards/DrugCombAccuracyCOTORM/std": 0.24863573908805847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 424.3125, "completions/min_length": 324.0, "epoch": 8.504411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7763090133666992, "kl": 0.010421182727441192, "learning_rate": 7.109601405804692e-07, "loss": 0.00010363136243540794, "reward": 0.7928333282470703, "reward_std": 0.23074394464492798, "rewards/DrugCombAccuracyCOTORM/mean": 0.7775000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4020530879497528, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 5783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 441.125, "completions/min_length": 402.0, "epoch": 8.505882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.027302566915750504, "kl": 0.009932392742484808, "learning_rate": 7.108437826785831e-07, "loss": 9.860776481218636e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 445.25, "completions/min_length": 385.0, "epoch": 8.507352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9743170738220215, "kl": 0.008182231453247368, "learning_rate": 7.107274108868421e-07, "loss": 8.143450395436957e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/mean_length": 502.875, "completions/min_length": 378.0, "epoch": 8.508823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8196120858192444, "kl": 0.008644426590763032, "learning_rate": 7.106110252129125e-07, "loss": 8.587518095737323e-05, "reward": 0.32500001788139343, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.28125, "rewards/DrugCombAccuracyCOTORM/std": 0.3145764470100403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 457.6875, "completions/min_length": 375.0, "epoch": 8.510294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.054382361471652985, "kl": 0.011236678808927536, "learning_rate": 7.104946256644616e-07, "loss": 0.00011337742762407288, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 504.625, "completions/min_length": 420.0, "epoch": 8.511764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8565008640289307, "kl": 0.015272820368409157, "learning_rate": 7.103782122491577e-07, "loss": 0.00015538872685283422, "reward": 0.659250020980835, "reward_std": 0.03417681157588959, "rewards/DrugCombAccuracyCOTORM/mean": 0.5975000262260437, "rewards/DrugCombAccuracyCOTORM/std": 0.3928613066673279, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 5788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 437.5625, "completions/min_length": 351.0, "epoch": 8.513235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03451893478631973, "kl": 0.011900147190317512, "learning_rate": 7.102617849746692e-07, "loss": 0.00011898669617949054, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 422.75, "completions/min_length": 387.0, "epoch": 8.514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013502391055226326, "kl": 0.010072557954117656, "learning_rate": 7.101453438486666e-07, "loss": 0.00010091383592225611, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 432.875, "completions/min_length": 333.0, "epoch": 8.516176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01308246050029993, "kl": 0.008833653293550014, "learning_rate": 7.100288888788206e-07, "loss": 8.758304466027766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 435.4375, "completions/min_length": 385.0, "epoch": 8.51764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.09497653692960739, "kl": 0.01145693869329989, "learning_rate": 7.099124200728027e-07, "loss": 0.00011405960685806349, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 416.5625, "completions/min_length": 382.0, "epoch": 8.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.007525064051151276, "kl": 0.006243035662919283, "learning_rate": 7.097959374382859e-07, "loss": 6.264390685828403e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 439.0, "completions/min_length": 357.0, "epoch": 8.520588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012116478756070137, "kl": 0.009123729425482452, "learning_rate": 7.096794409829438e-07, "loss": 9.091986430576071e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 442.5, "completions/min_length": 392.0, "epoch": 8.522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.011190433986485004, "kl": 0.009260257938876748, "learning_rate": 7.095629307144506e-07, "loss": 9.217869956046343e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 444.8125, "completions/min_length": 377.0, "epoch": 8.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8315771818161011, "kl": 0.011162506649270654, "learning_rate": 7.09446406640482e-07, "loss": 0.00011214613914489746, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 447.25, "completions/min_length": 371.0, "epoch": 8.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0109116705134511, "kl": 0.009026485029608011, "learning_rate": 7.09329868768714e-07, "loss": 9.011027577798814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 431.375, "completions/min_length": 392.0, "epoch": 8.526470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012770275585353374, "kl": 0.009578920435160398, "learning_rate": 7.09213317106824e-07, "loss": 9.62545454967767e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 436.0, "completions/min_length": 347.0, "epoch": 8.527941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02011020854115486, "kl": 0.011342990212142467, "learning_rate": 7.090967516624903e-07, "loss": 0.00011366789840394631, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 418.125, "completions/min_length": 369.0, "epoch": 8.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018426597118377686, "kl": 0.009936680784448981, "learning_rate": 7.089801724433917e-07, "loss": 0.00010026506788562983, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 444.125, "completions/min_length": 376.0, "epoch": 8.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0718967914581299, "kl": 0.009088743943721056, "learning_rate": 7.088635794572082e-07, "loss": 9.076943388208747e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 419.0, "completions/min_length": 339.0, "epoch": 8.532352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.853454053401947, "kl": 0.009441409725695848, "learning_rate": 7.087469727116207e-07, "loss": 9.411761129740626e-05, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 439.25, "completions/min_length": 385.0, "epoch": 8.533823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9343629479408264, "kl": 0.013044564053416252, "learning_rate": 7.086303522143106e-07, "loss": 0.00013020634651184082, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 425.125, "completions/min_length": 337.0, "epoch": 8.535294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0814741849899292, "kl": 0.013198813190683722, "learning_rate": 7.085137179729612e-07, "loss": 0.00013108212442602962, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 466.5, "completions/min_length": 420.0, "epoch": 8.536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008468310348689556, "kl": 0.007213353645056486, "learning_rate": 7.083970699952556e-07, "loss": 7.172790355980396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 472.1875, "completions/min_length": 395.0, "epoch": 8.538235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0144078703597188, "kl": 0.009701123461127281, "learning_rate": 7.082804082888786e-07, "loss": 9.677216439740732e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 499.5625, "completions/min_length": 397.0, "epoch": 8.53970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.028270360082387924, "kl": 0.011930118780583143, "learning_rate": 7.081637328615153e-07, "loss": 0.00011834440374514088, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 445.1875, "completions/min_length": 411.0, "epoch": 8.541176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012728692032396793, "kl": 0.0101060502929613, "learning_rate": 7.08047043720852e-07, "loss": 0.00010098198981722817, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 454.4375, "completions/min_length": 414.0, "epoch": 8.54264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2221548557281494, "kl": 0.00881353160366416, "learning_rate": 7.079303408745759e-07, "loss": 8.820369839668274e-05, "reward": 0.6812500357627869, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 462.375, "completions/min_length": 404.0, "epoch": 8.544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.028822679072618484, "kl": 0.010750388726592064, "learning_rate": 7.078136243303752e-07, "loss": 0.00010847895464394242, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 462.0625, "completions/min_length": 395.0, "epoch": 8.545588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.025821497663855553, "kl": 0.01230127620510757, "learning_rate": 7.076968940959391e-07, "loss": 0.00012258486822247505, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 423.9375, "completions/min_length": 376.0, "epoch": 8.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015351532027125359, "kl": 0.01047907373867929, "learning_rate": 7.075801501789569e-07, "loss": 0.000103734913864173, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 444.5, "completions/min_length": 413.0, "epoch": 8.548529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8447387218475342, "kl": 0.01653708633966744, "learning_rate": 7.074633925871198e-07, "loss": 0.00016399308515246958, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 454.875, "completions/min_length": 397.0, "epoch": 8.55, "frac_reward_zero_std": 0.5, "grad_norm": 1.166919469833374, "kl": 0.013547098729759455, "learning_rate": 7.073466213281195e-07, "loss": 0.00013687789032701403, "reward": 0.7937500476837158, "reward_std": 0.22109711170196533, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 482.5625, "completions/min_length": 427.0, "epoch": 8.551470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7143943309783936, "kl": 0.010922303888946772, "learning_rate": 7.072298364096485e-07, "loss": 0.0001084020477719605, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 427.4375, "completions/min_length": 376.0, "epoch": 8.552941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8232145309448242, "kl": 0.009007551590912044, "learning_rate": 7.071130378394003e-07, "loss": 8.792922017164528e-05, "reward": 0.8296874761581421, "reward_std": 0.23508523404598236, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 444.125, "completions/min_length": 392.0, "epoch": 8.554411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.7347612380981445, "kl": 0.012480276636779308, "learning_rate": 7.069962256250694e-07, "loss": 0.00012581050395965576, "reward": 0.7875000238418579, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 462.5625, "completions/min_length": 404.0, "epoch": 8.555882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016282884404063225, "kl": 0.00973003963008523, "learning_rate": 7.068793997743508e-07, "loss": 9.91923370747827e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 458.4375, "completions/min_length": 408.0, "epoch": 8.55735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1812160015106201, "kl": 0.011475927196443081, "learning_rate": 7.06762560294941e-07, "loss": 0.00011528821778483689, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 414.5, "completions/min_length": 390.0, "epoch": 8.558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01589733175933361, "kl": 0.009490460506640375, "learning_rate": 7.066457071945369e-07, "loss": 9.433654486201704e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 469.125, "completions/min_length": 421.0, "epoch": 8.560294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016159040853381157, "kl": 0.010592264356091619, "learning_rate": 7.065288404808367e-07, "loss": 0.0001060556314769201, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 492.1875, "completions/min_length": 419.0, "epoch": 8.561764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.010520796291530132, "kl": 0.00907086452934891, "learning_rate": 7.06411960161539e-07, "loss": 9.071882232092321e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 398.5625, "completions/min_length": 344.0, "epoch": 8.563235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1051039695739746, "kl": 0.011128159239888191, "learning_rate": 7.062950662443439e-07, "loss": 0.00011125813762191683, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 5823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 525.5, "completions/min_length": 422.0, "epoch": 8.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.105104684829712, "kl": 0.013766341377049685, "learning_rate": 7.061781587369518e-07, "loss": 0.0001401312038069591, "reward": 0.9100833535194397, "reward_std": 0.1729465276002884, "rewards/DrugCombAccuracyCOTORM/mean": 0.90583336353302, "rewards/DrugCombAccuracyCOTORM/std": 0.2564472258090973, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 5824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 431.5625, "completions/min_length": 374.0, "epoch": 8.566176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7942101359367371, "kl": 0.010801593773066998, "learning_rate": 7.060612376470643e-07, "loss": 0.00010760128498077393, "reward": 0.543749988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 523.5, "completions/min_length": 427.0, "epoch": 8.56764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018727360293269157, "kl": 0.009905047481879592, "learning_rate": 7.05944302982384e-07, "loss": 9.970598330255598e-05, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.12909944355487823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 463.625, "completions/min_length": 408.0, "epoch": 8.569117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0804404020309448, "kl": 0.011854906333610415, "learning_rate": 7.058273547506142e-07, "loss": 0.00011780112981796265, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 525.0625, "completions/min_length": 456.0, "epoch": 8.570588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.4658913612365723, "kl": 0.015967058716341853, "learning_rate": 7.057103929594592e-07, "loss": 0.00015979260206222534, "reward": 0.4361041784286499, "reward_std": 0.16841046512126923, "rewards/DrugCombAccuracyCOTORM/mean": 0.34591144323349, "rewards/DrugCombAccuracyCOTORM/std": 0.3536726236343384, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 5828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 411.3125, "completions/min_length": 342.0, "epoch": 8.572058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9942668080329895, "kl": 0.010179930832237005, "learning_rate": 7.055934176166241e-07, "loss": 0.00010222196578979492, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 442.375, "completions/min_length": 359.0, "epoch": 8.573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.87620609998703, "kl": 0.0119601059705019, "learning_rate": 7.054764287298149e-07, "loss": 0.00011952966451644897, "reward": 0.6736666560173035, "reward_std": 0.07983177155256271, "rewards/DrugCombAccuracyCOTORM/mean": 0.6207291483879089, "rewards/DrugCombAccuracyCOTORM/std": 0.4122130870819092, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.28463754057884216, "step": 5830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 483.875, "completions/min_length": 389.0, "epoch": 8.575, "frac_reward_zero_std": 0.0, "grad_norm": 1.318196415901184, "kl": 0.012677163118496537, "learning_rate": 7.053594263067386e-07, "loss": 0.0001287311315536499, "reward": 0.4937500059604645, "reward_std": 0.19756440818309784, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 5831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 446.1875, "completions/min_length": 362.0, "epoch": 8.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9420666694641113, "kl": 0.009967640973627567, "learning_rate": 7.05242410355103e-07, "loss": 9.981472976505756e-05, "reward": 0.925000011920929, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 457.375, "completions/min_length": 388.0, "epoch": 8.577941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8169921636581421, "kl": 0.008681154809892178, "learning_rate": 7.051253808826168e-07, "loss": 8.682906627655029e-05, "reward": 0.824999988079071, "reward_std": 0.19820623099803925, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.4069705307483673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 475.625, "completions/min_length": 442.0, "epoch": 8.579411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9969387054443359, "kl": 0.01156494952738285, "learning_rate": 7.050083378969896e-07, "loss": 0.00011563301086425781, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 433.9375, "completions/min_length": 385.0, "epoch": 8.580882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0368483066558838, "kl": 0.011180685833096504, "learning_rate": 7.04891281405932e-07, "loss": 0.00011158917914144695, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 423.375, "completions/min_length": 370.0, "epoch": 8.58235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9233930110931396, "kl": 0.010182000696659088, "learning_rate": 7.047742114171552e-07, "loss": 0.00010204315185546875, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 469.375, "completions/min_length": 407.0, "epoch": 8.583823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.2326678037643433, "kl": 0.012000918970443308, "learning_rate": 7.046571279383715e-07, "loss": 0.00012077391147613525, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 473.25, "completions/min_length": 358.0, "epoch": 8.58529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0027315616607666, "kl": 0.010236020782031119, "learning_rate": 7.045400309772942e-07, "loss": 0.00010254586231894791, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 504.8125, "completions/min_length": 431.0, "epoch": 8.586764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.3322995901107788, "kl": 0.02026458946056664, "learning_rate": 7.044229205416373e-07, "loss": 0.00020211189985275269, "reward": 0.6911458373069763, "reward_std": 0.26761481165885925, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.4149966835975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9114583134651184, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 5839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 486.75, "completions/min_length": 438.0, "epoch": 8.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.05067615211009979, "kl": 0.015804708236828446, "learning_rate": 7.043057966391157e-07, "loss": 0.0001575978531036526, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 500.6875, "completions/min_length": 448.0, "epoch": 8.589705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8528690934181213, "kl": 0.009413492167368531, "learning_rate": 7.041886592774454e-07, "loss": 9.326083090854809e-05, "reward": 0.8614583611488342, "reward_std": 0.08952446281909943, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.20069323480129242, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 5841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 431.0, "completions/min_length": 383.0, "epoch": 8.591176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.447059988975525, "kl": 0.02000249526463449, "learning_rate": 7.040715084643429e-07, "loss": 0.0001985505223274231, "reward": 0.46033334732055664, "reward_std": 0.2449216991662979, "rewards/DrugCombAccuracyCOTORM/mean": 0.3462499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4017109274864197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 5842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 433.5, "completions/min_length": 357.0, "epoch": 8.592647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0492569208145142, "kl": 0.01254019234329462, "learning_rate": 7.039543442075258e-07, "loss": 0.00012627802789211273, "reward": 0.6563905477523804, "reward_std": 0.1672094464302063, "rewards/DrugCombAccuracyCOTORM/mean": 0.6243944764137268, "rewards/DrugCombAccuracyCOTORM/std": 0.4478762745857239, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5687500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.7844053506851196, "step": 5843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 445.4375, "completions/min_length": 388.0, "epoch": 8.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.02803797461092472, "kl": 0.010578607907518744, "learning_rate": 7.038371665147126e-07, "loss": 0.00010529712017159909, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 432.4375, "completions/min_length": 348.0, "epoch": 8.595588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0695298910140991, "kl": 0.010237528244033456, "learning_rate": 7.037199753936227e-07, "loss": 0.00010071630822494626, "reward": 0.23750001192092896, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 5845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 427.125, "completions/min_length": 395.0, "epoch": 8.597058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011710268445312977, "kl": 0.009440785041078925, "learning_rate": 7.036027708519765e-07, "loss": 9.466473420616239e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 530.125, "completions/min_length": 459.0, "epoch": 8.598529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9160652160644531, "kl": 0.010028762510046363, "learning_rate": 7.03485552897495e-07, "loss": 0.00010089203715324402, "reward": 0.6974523663520813, "reward_std": 0.18710018694400787, "rewards/DrugCombAccuracyCOTORM/mean": 0.6313021183013916, "rewards/DrugCombAccuracyCOTORM/std": 0.49217140674591064, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9241071343421936, "rewards/DrugCombCoverageCOTORM/std": 0.20737877488136292, "step": 5847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 505.25, "completions/min_length": 423.0, "epoch": 8.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.7100133895874023, "kl": 0.011211567791178823, "learning_rate": 7.033683215379002e-07, "loss": 0.00011236965656280518, "reward": 0.6416666507720947, "reward_std": 0.1458418369293213, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 473.0, "completions/min_length": 393.0, "epoch": 8.601470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00956677831709385, "kl": 0.008933071279898286, "learning_rate": 7.03251076780915e-07, "loss": 8.941847772803158e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 490.4375, "completions/min_length": 402.0, "epoch": 8.602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012996691279113293, "kl": 0.009295198018662632, "learning_rate": 7.031338186342632e-07, "loss": 9.202474029734731e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 428.875, "completions/min_length": 341.0, "epoch": 8.604411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0239675045013428, "kl": 0.011679081595502794, "learning_rate": 7.030165471056695e-07, "loss": 0.00011879950761795044, "reward": 0.7250000238418579, "reward_std": 0.23145504295825958, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 447.8125, "completions/min_length": 372.0, "epoch": 8.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.016289275139570236, "kl": 0.011889780405908823, "learning_rate": 7.028992622028596e-07, "loss": 0.00011809049465227872, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 523.8125, "completions/min_length": 435.0, "epoch": 8.60735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8459576368331909, "kl": 0.010298358742147684, "learning_rate": 7.027819639335597e-07, "loss": 0.0001027621328830719, "reward": 0.9302083253860474, "reward_std": 0.09717614203691483, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18257419764995575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 455.75, "completions/min_length": 398.0, "epoch": 8.608823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9897149801254272, "kl": 0.01413254695944488, "learning_rate": 7.026646523054971e-07, "loss": 0.00014196049596648663, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 504.625, "completions/min_length": 480.0, "epoch": 8.610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.008084931410849094, "kl": 0.0068064716178923845, "learning_rate": 7.025473273264001e-07, "loss": 6.825741729699075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 479.375, "completions/min_length": 389.0, "epoch": 8.611764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8315877914428711, "kl": 0.00839724054094404, "learning_rate": 7.024299890039978e-07, "loss": 8.313309808727354e-05, "reward": 0.925000011920929, "reward_std": 0.14880475401878357, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 456.5625, "completions/min_length": 394.0, "epoch": 8.613235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0197595357894897, "kl": 0.01718355342745781, "learning_rate": 7.023126373460201e-07, "loss": 0.00017150864005088806, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 5857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 440.625, "completions/min_length": 364.0, "epoch": 8.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.5979015231132507, "kl": 0.022326285834424198, "learning_rate": 7.021952723601979e-07, "loss": 0.0002264260547235608, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 472.375, "completions/min_length": 354.0, "epoch": 8.616176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8581904768943787, "kl": 0.011148914927616715, "learning_rate": 7.020778940542629e-07, "loss": 0.00011105090379714966, "reward": 0.800000011920929, "reward_std": 0.17457431554794312, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 505.875, "completions/min_length": 446.0, "epoch": 8.617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8101065754890442, "kl": 0.01154427626170218, "learning_rate": 7.019605024359474e-07, "loss": 0.00011506615555845201, "reward": 0.22708332538604736, "reward_std": 0.0766032412648201, "rewards/DrugCombAccuracyCOTORM/mean": 0.0416666679084301, "rewards/DrugCombAccuracyCOTORM/std": 0.1666666716337204, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 421.9375, "completions/min_length": 374.0, "epoch": 8.619117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0431986078619957, "kl": 0.013784510316327214, "learning_rate": 7.018430975129852e-07, "loss": 0.00014004674449097365, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 455.9375, "completions/min_length": 411.0, "epoch": 8.620588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.880446195602417, "kl": 0.010382321081124246, "learning_rate": 7.017256792931106e-07, "loss": 0.00010398583253845572, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 441.0, "completions/min_length": 398.0, "epoch": 8.62205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9857215285301208, "kl": 0.013430570485070348, "learning_rate": 7.016082477840588e-07, "loss": 0.0001336311543127522, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 414.1875, "completions/min_length": 370.0, "epoch": 8.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 54.89106369018555, "kl": 0.6720729070948437, "learning_rate": 7.014908029935658e-07, "loss": 0.006194527726620436, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 433.0, "completions/min_length": 392.0, "epoch": 8.625, "frac_reward_zero_std": 0.5, "grad_norm": 1.0329318046569824, "kl": 0.014504330465570092, "learning_rate": 7.013733449293686e-07, "loss": 0.00014621764421463013, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 528.9375, "completions/min_length": 458.0, "epoch": 8.626470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.024480769410729408, "kl": 0.012886508833616972, "learning_rate": 7.012558735992051e-07, "loss": 0.00012909303768537939, "reward": 0.8666666746139526, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 434.1875, "completions/min_length": 385.0, "epoch": 8.62794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.04978882893919945, "kl": 0.01333214109763503, "learning_rate": 7.011383890108138e-07, "loss": 0.00013483024667948484, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 498.0, "completions/min_length": 415.0, "epoch": 8.629411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5164275169372559, "kl": 0.011754029663279653, "learning_rate": 7.010208911719346e-07, "loss": 0.00011645257472991943, "reward": 0.84375, "reward_std": 0.22469735145568848, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 460.4375, "completions/min_length": 391.0, "epoch": 8.630882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01692645065486431, "kl": 0.010500545962713659, "learning_rate": 7.009033800903077e-07, "loss": 0.00010472388385096565, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 444.0625, "completions/min_length": 377.0, "epoch": 8.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.032381631433963776, "kl": 0.01144140143878758, "learning_rate": 7.007858557736747e-07, "loss": 0.00011514008656376973, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 444.5625, "completions/min_length": 417.0, "epoch": 8.633823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0009315013885498, "kl": 0.009931974578648806, "learning_rate": 7.006683182297775e-07, "loss": 9.883195161819458e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 409.625, "completions/min_length": 351.0, "epoch": 8.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01920657604932785, "kl": 0.011379272444173694, "learning_rate": 7.005507674663593e-07, "loss": 0.00011413395986892283, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 462.5, "completions/min_length": 412.0, "epoch": 8.636764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9733672142028809, "kl": 0.011018997756764293, "learning_rate": 7.004332034911642e-07, "loss": 0.00010962348460452631, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 426.8125, "completions/min_length": 402.0, "epoch": 8.638235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012586945667862892, "kl": 0.011534627294167876, "learning_rate": 7.003156263119367e-07, "loss": 0.00011561702558537945, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 420.625, "completions/min_length": 366.0, "epoch": 8.639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.009782455861568451, "kl": 0.007324477657675743, "learning_rate": 7.001980359364229e-07, "loss": 7.324980106204748e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 455.625, "completions/min_length": 366.0, "epoch": 8.641176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1661038398742676, "kl": 0.01614005258306861, "learning_rate": 7.000804323723691e-07, "loss": 0.00016744995082262903, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 5876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 461.6875, "completions/min_length": 404.0, "epoch": 8.64264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9324246644973755, "kl": 0.013323740800842643, "learning_rate": 6.999628156275226e-07, "loss": 0.00013291090726852417, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 400.8125, "completions/min_length": 317.0, "epoch": 8.644117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014874191954731941, "kl": 0.007684677722863853, "learning_rate": 6.99845185709632e-07, "loss": 7.64330179663375e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 472.6875, "completions/min_length": 423.0, "epoch": 8.645588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.018565217033028603, "kl": 0.01264009065926075, "learning_rate": 6.997275426264462e-07, "loss": 0.00012635010352823883, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 500.25, "completions/min_length": 425.0, "epoch": 8.647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.909130871295929, "kl": 0.008693502051755786, "learning_rate": 6.996098863857154e-07, "loss": 8.715557487448677e-05, "reward": 0.9869999885559082, "reward_std": 0.03676954284310341, "rewards/DrugCombAccuracyCOTORM/mean": 0.9837499856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.06499999761581421, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 428.875, "completions/min_length": 357.0, "epoch": 8.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013977051712572575, "kl": 0.00821720075327903, "learning_rate": 6.994922169951904e-07, "loss": 8.189524669433013e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 460.5625, "completions/min_length": 399.0, "epoch": 8.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.9394234418869019, "kl": 0.011414137901738286, "learning_rate": 6.993745344626231e-07, "loss": 0.00011439944501034915, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 468.75, "completions/min_length": 380.0, "epoch": 8.651470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.013275250792503357, "kl": 0.007127301883883774, "learning_rate": 6.992568387957659e-07, "loss": 7.124055264284834e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 482.75, "completions/min_length": 404.0, "epoch": 8.652941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.849844217300415, "kl": 0.009669220889918506, "learning_rate": 6.991391300023725e-07, "loss": 9.66973602771759e-05, "reward": 0.925000011920929, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 413.1875, "completions/min_length": 373.0, "epoch": 8.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011652226559817791, "kl": 0.009433438302949071, "learning_rate": 6.990214080901971e-07, "loss": 9.467914060223848e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 451.4375, "completions/min_length": 411.0, "epoch": 8.655882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.354954719543457, "kl": 0.010931517230346799, "learning_rate": 6.989036730669951e-07, "loss": 0.00010926276445388794, "reward": 0.893750011920929, "reward_std": 0.20284168422222137, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 423.8125, "completions/min_length": 387.0, "epoch": 8.657352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.017715787515044212, "kl": 0.010374655947089195, "learning_rate": 6.987859249405224e-07, "loss": 0.00010282419680152088, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 568.9375, "completions/min_length": 496.0, "epoch": 8.658823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3681633472442627, "kl": 0.011209028773009777, "learning_rate": 6.986681637185359e-07, "loss": 0.00011271238327026367, "reward": 0.5734583139419556, "reward_std": 0.34328925609588623, "rewards/DrugCombAccuracyCOTORM/mean": 0.48374998569488525, "rewards/DrugCombAccuracyCOTORM/std": 0.43568912148475647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.19454362988471985, "step": 5888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 445.875, "completions/min_length": 378.0, "epoch": 8.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01267361082136631, "kl": 0.010023597860708833, "learning_rate": 6.985503894087939e-07, "loss": 0.0001008398539852351, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 457.5, "completions/min_length": 410.0, "epoch": 8.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016925092786550522, "kl": 0.0091048632748425, "learning_rate": 6.984326020190543e-07, "loss": 9.086232603294775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 512.375, "completions/min_length": 457.0, "epoch": 8.663235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.9269264340400696, "kl": 0.012020852882415056, "learning_rate": 6.983148015570772e-07, "loss": 0.00012012671504635364, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 419.75, "completions/min_length": 359.0, "epoch": 8.66470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9589753150939941, "kl": 0.008461955701932311, "learning_rate": 6.98196988030623e-07, "loss": 8.443515980616212e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 450.125, "completions/min_length": 404.0, "epoch": 8.666176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0085498094558716, "kl": 0.008496582973748446, "learning_rate": 6.980791614474526e-07, "loss": 8.456408977508545e-05, "reward": 0.5874999761581421, "reward_std": 0.007715167012065649, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 5893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 488.125, "completions/min_length": 426.0, "epoch": 8.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01100570522248745, "kl": 0.007780812215059996, "learning_rate": 6.979613218153285e-07, "loss": 7.816482684575021e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 522.1875, "completions/min_length": 467.0, "epoch": 8.669117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8743179440498352, "kl": 0.010915480786934495, "learning_rate": 6.978434691420134e-07, "loss": 0.00010981658851960674, "reward": 0.8967083692550659, "reward_std": 0.15700268745422363, "rewards/DrugCombAccuracyCOTORM/mean": 0.87479168176651, "rewards/DrugCombAccuracyCOTORM/std": 0.2892204523086548, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 5895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 463.3125, "completions/min_length": 384.0, "epoch": 8.670588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8512385487556458, "kl": 0.009876928408630192, "learning_rate": 6.977256034352712e-07, "loss": 9.883195161819458e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 468.0, "completions/min_length": 373.0, "epoch": 8.672058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.019785817712545395, "kl": 0.012291510822251439, "learning_rate": 6.976077247028666e-07, "loss": 0.00012273917673155665, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 5897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 478.9375, "completions/min_length": 423.0, "epoch": 8.673529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012904293835163116, "kl": 0.008018174325115979, "learning_rate": 6.974898329525654e-07, "loss": 7.958063361002132e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 502.625, "completions/min_length": 403.0, "epoch": 8.675, "frac_reward_zero_std": 0.0, "grad_norm": 1.3221244812011719, "kl": 0.013942980440333486, "learning_rate": 6.973719281921336e-07, "loss": 0.0001386106014251709, "reward": 0.7358125448226929, "reward_std": 0.351374089717865, "rewards/DrugCombAccuracyCOTORM/mean": 0.7029687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4585931599140167, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.734375, "rewards/DrugCombCoverageCOTORM/std": 0.4422362744808197, "step": 5899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 393.5, "completions/min_length": 316.0, "epoch": 8.676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.011788069270551205, "kl": 0.008653754135593772, "learning_rate": 6.972540104293388e-07, "loss": 8.599179273005575e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 437.1875, "completions/min_length": 375.0, "epoch": 8.677941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 3.5066423416137695, "kl": 0.011053416877985, "learning_rate": 6.971360796719488e-07, "loss": 0.00010971724987030029, "reward": 0.593666672706604, "reward_std": 0.2196744978427887, "rewards/DrugCombAccuracyCOTORM/mean": 0.5337499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43054813146591187, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5163978338241577, "step": 5901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 470.0625, "completions/min_length": 429.0, "epoch": 8.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8671658635139465, "kl": 0.00959442206658423, "learning_rate": 6.970181359277327e-07, "loss": 9.616252646083012e-05, "reward": 0.8190000057220459, "reward_std": 0.1637842208147049, "rewards/DrugCombAccuracyCOTORM/mean": 0.784166693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.34634920954704285, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 5902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 472.9375, "completions/min_length": 361.0, "epoch": 8.680882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0239804983139038, "kl": 0.01455468381755054, "learning_rate": 6.969001792044607e-07, "loss": 0.00014517316594719887, "reward": 0.7943333387374878, "reward_std": 0.16863994300365448, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3695680797100067, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.22771002352237701, "step": 5903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 489.5, "completions/min_length": 440.0, "epoch": 8.68235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.021484613418579, "kl": 0.01471368782222271, "learning_rate": 6.967822095099029e-07, "loss": 0.00014733336865901947, "reward": 0.7791666984558105, "reward_std": 0.23767225444316864, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39849257469177246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 5904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 476.625, "completions/min_length": 437.0, "epoch": 8.683823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7668601274490356, "kl": 0.011477270280010998, "learning_rate": 6.966642268518313e-07, "loss": 0.0001148453084169887, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 448.875, "completions/min_length": 408.0, "epoch": 8.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.05677285045385361, "kl": 0.01429801108315587, "learning_rate": 6.965462312380182e-07, "loss": 0.00014409144932869822, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 441.5, "completions/min_length": 414.0, "epoch": 8.686764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.034405793994665146, "kl": 0.014254104346036911, "learning_rate": 6.964282226762366e-07, "loss": 0.0001433478610124439, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 484.25, "completions/min_length": 428.0, "epoch": 8.688235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.2232521772384644, "kl": 0.00929804251063615, "learning_rate": 6.963102011742608e-07, "loss": 9.315460920333862e-05, "reward": 0.8091250061988831, "reward_std": 0.3534318208694458, "rewards/DrugCombAccuracyCOTORM/mean": 0.7809374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.3970473110675812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.3400367796421051, "step": 5908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.0625, "completions/min_length": 415.0, "epoch": 8.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1535909175872803, "kl": 0.01168921240605414, "learning_rate": 6.961921667398658e-07, "loss": 0.00011560320854187012, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 460.125, "completions/min_length": 416.0, "epoch": 8.691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02570868842303753, "kl": 0.0110832538921386, "learning_rate": 6.960741193808273e-07, "loss": 0.00011047490988858044, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 429.6875, "completions/min_length": 359.0, "epoch": 8.69264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012882876209914684, "kl": 0.00973810045979917, "learning_rate": 6.95956059104922e-07, "loss": 9.789083560463041e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 416.0, "completions/min_length": 332.0, "epoch": 8.694117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.010917650535702705, "kl": 0.009456513449549675, "learning_rate": 6.958379859199276e-07, "loss": 9.409485210198909e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 415.875, "completions/min_length": 363.0, "epoch": 8.695588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014571095816791058, "kl": 0.009822425548918545, "learning_rate": 6.957198998336222e-07, "loss": 9.846928878687322e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 488.75, "completions/min_length": 429.0, "epoch": 8.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0005221366882324, "kl": 0.012056898791342974, "learning_rate": 6.956018008537851e-07, "loss": 0.00012044215691275895, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 515.25, "completions/min_length": 439.0, "epoch": 8.698529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.4429594278335571, "kl": 0.015800709137693048, "learning_rate": 6.954836889881964e-07, "loss": 0.00016224756836891174, "reward": 0.71875, "reward_std": 0.4412066340446472, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 445.1875, "completions/min_length": 366.0, "epoch": 8.7, "frac_reward_zero_std": 0.0, "grad_norm": 1.3916035890579224, "kl": 0.011200163746252656, "learning_rate": 6.953655642446367e-07, "loss": 0.00011126697063446045, "reward": 0.8589166402816772, "reward_std": 0.3013652563095093, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 470.5, "completions/min_length": 426.0, "epoch": 8.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016155924648046494, "kl": 0.008370772120542824, "learning_rate": 6.952474266308883e-07, "loss": 8.348537085112184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 471.9375, "completions/min_length": 393.0, "epoch": 8.702941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.015261913649737835, "kl": 0.010323918424546719, "learning_rate": 6.951292761547337e-07, "loss": 0.00010286684846505523, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 450.3125, "completions/min_length": 385.0, "epoch": 8.704411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.671289086341858, "kl": 0.014417890692129731, "learning_rate": 6.950111128239559e-07, "loss": 0.00014390796422958374, "reward": 0.22500000894069672, "reward_std": 0.18771237134933472, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 467.5, "completions/min_length": 406.0, "epoch": 8.705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.015148692764341831, "kl": 0.011020004283636808, "learning_rate": 6.948929366463396e-07, "loss": 0.00011081036791438237, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 463.25, "completions/min_length": 394.0, "epoch": 8.70735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8711303472518921, "kl": 0.008969148620963097, "learning_rate": 6.947747476296699e-07, "loss": 8.927803719416261e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 455.5625, "completions/min_length": 381.0, "epoch": 8.708823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.7495396137237549, "kl": 0.012519070180132985, "learning_rate": 6.946565457817328e-07, "loss": 0.00012559443712234497, "reward": 0.7562500238418579, "reward_std": 0.4178736209869385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 5922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 460.25, "completions/min_length": 346.0, "epoch": 8.71029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7405188083648682, "kl": 0.011043467791751027, "learning_rate": 6.945383311103148e-07, "loss": 0.00011035367788281292, "reward": 0.7002869844436646, "reward_std": 0.16056320071220398, "rewards/DrugCombAccuracyCOTORM/mean": 0.6583448648452759, "rewards/DrugCombAccuracyCOTORM/std": 0.435886412858963, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7361111640930176, "rewards/DrugCombCoverageCOTORM/std": 0.3267875909805298, "step": 5923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 495.8125, "completions/min_length": 413.0, "epoch": 8.711764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.8141112327575684, "kl": 0.010597290471196175, "learning_rate": 6.944201036232041e-07, "loss": 0.00010582504910416901, "reward": 0.5958333015441895, "reward_std": 0.0117851123213768, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 5924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 520.625, "completions/min_length": 425.0, "epoch": 8.713235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3928749561309814, "kl": 0.012167822336778045, "learning_rate": 6.94301863328189e-07, "loss": 0.00012220442295074463, "reward": 0.503166675567627, "reward_std": 0.3183606266975403, "rewards/DrugCombAccuracyCOTORM/mean": 0.4050000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.41787824034690857, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24720662832260132, "step": 5925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 474.0625, "completions/min_length": 394.0, "epoch": 8.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01577945426106453, "kl": 0.007596643292345107, "learning_rate": 6.941836102330587e-07, "loss": 7.584081322420388e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 463.5, "completions/min_length": 388.0, "epoch": 8.716176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01027607824653387, "kl": 0.011326419655233622, "learning_rate": 6.940653443456038e-07, "loss": 0.00011314755101921037, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 461.0625, "completions/min_length": 367.0, "epoch": 8.717647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.109809160232544, "kl": 0.013995148008689284, "learning_rate": 6.939470656736148e-07, "loss": 0.00013968173880130053, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 437.5, "completions/min_length": 396.0, "epoch": 8.719117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0800296887755394, "kl": 0.013915414456278086, "learning_rate": 6.938287742248842e-07, "loss": 0.0001380040484946221, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 456.8125, "completions/min_length": 405.0, "epoch": 8.720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1374388933181763, "kl": 0.012748950626701117, "learning_rate": 6.937104700072045e-07, "loss": 0.00012758324737660587, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 5930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 497.0625, "completions/min_length": 439.0, "epoch": 8.722058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8084521293640137, "kl": 0.009249790920875967, "learning_rate": 6.935921530283691e-07, "loss": 9.283743565902114e-05, "reward": 0.7091250419616699, "reward_std": 0.1821604073047638, "rewards/DrugCombAccuracyCOTORM/mean": 0.6559374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.4617077708244324, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.3010398745536804, "step": 5931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 486.25, "completions/min_length": 410.0, "epoch": 8.723529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.049131155014038, "kl": 0.010116247227415442, "learning_rate": 6.934738232961726e-07, "loss": 0.00010015350562753156, "reward": 0.9302083253860474, "reward_std": 0.09717614203691483, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18257419764995575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 5932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 489.375, "completions/min_length": 437.0, "epoch": 8.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.8731973171234131, "kl": 0.008111469214782119, "learning_rate": 6.933554808184103e-07, "loss": 8.109495684038848e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 5933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 419.6875, "completions/min_length": 386.0, "epoch": 8.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01487426646053791, "kl": 0.008609263109974563, "learning_rate": 6.932371256028782e-07, "loss": 8.559017442166805e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 525.1875, "completions/min_length": 435.0, "epoch": 8.727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9636110067367554, "kl": 0.01609006687067449, "learning_rate": 6.931187576573733e-07, "loss": 0.00016392022371292114, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 448.0, "completions/min_length": 396.0, "epoch": 8.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.014488586224615574, "kl": 0.00821547769010067, "learning_rate": 6.930003769896935e-07, "loss": 8.263612835435197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 425.0625, "completions/min_length": 380.0, "epoch": 8.730882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010979941114783287, "kl": 0.008946117828600109, "learning_rate": 6.92881983607637e-07, "loss": 8.93282558536157e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 458.0, "completions/min_length": 401.0, "epoch": 8.73235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0275640487670898, "kl": 0.011472102953121066, "learning_rate": 6.927635775190037e-07, "loss": 0.00011440739035606384, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 451.875, "completions/min_length": 371.0, "epoch": 8.733823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0460025072097778, "kl": 0.010714623611420393, "learning_rate": 6.926451587315937e-07, "loss": 0.00010862201452255249, "reward": 0.5422499775886536, "reward_std": 0.0643245130777359, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9227073788642883, "step": 5939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 460.625, "completions/min_length": 431.0, "epoch": 8.735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1622631549835205, "kl": 0.015755360946059227, "learning_rate": 6.925267272532082e-07, "loss": 0.00015840629930607975, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 474.3125, "completions/min_length": 424.0, "epoch": 8.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.022367004305124283, "kl": 0.009257698897272348, "learning_rate": 6.924082830916492e-07, "loss": 9.227243572240695e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 425.125, "completions/min_length": 353.0, "epoch": 8.738235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0150515241548419, "kl": 0.008698524092324078, "learning_rate": 6.922898262547195e-07, "loss": 8.619290019851178e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 441.0625, "completions/min_length": 379.0, "epoch": 8.739705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3333816528320312, "kl": 0.010745628969743848, "learning_rate": 6.921713567502226e-07, "loss": 0.00010786950588226318, "reward": 0.6000000238418579, "reward_std": 0.4742809236049652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 5943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 443.3125, "completions/min_length": 380.0, "epoch": 8.741176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1049597263336182, "kl": 0.01098277559503913, "learning_rate": 6.920528745859629e-07, "loss": 0.00011071562767028809, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 450.0, "completions/min_length": 404.0, "epoch": 8.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9603543281555176, "kl": 0.01285936078056693, "learning_rate": 6.919343797697459e-07, "loss": 0.00012876838445663452, "reward": 0.7250000238418579, "reward_std": 0.14880476891994476, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.4366062581539154, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 486.8125, "completions/min_length": 428.0, "epoch": 8.744117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013660267926752567, "kl": 0.008786120335571468, "learning_rate": 6.918158723093778e-07, "loss": 8.811688894638792e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 436.1875, "completions/min_length": 365.0, "epoch": 8.745588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01152346096932888, "kl": 0.00855579855851829, "learning_rate": 6.916973522126653e-07, "loss": 8.528977923560888e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 5947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 526.6875, "completions/min_length": 439.0, "epoch": 8.74705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.842743992805481, "kl": 0.011153241153806448, "learning_rate": 6.915788194874165e-07, "loss": 0.00011169910430908203, "reward": 0.5089166760444641, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 5948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 454.0, "completions/min_length": 384.0, "epoch": 8.748529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.879675030708313, "kl": 0.013634195318445563, "learning_rate": 6.9146027414144e-07, "loss": 0.00013363361358642578, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 475.25, "completions/min_length": 416.0, "epoch": 8.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.015572953969240189, "kl": 0.01091877045109868, "learning_rate": 6.913417161825449e-07, "loss": 0.00010906362149398774, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 492.3125, "completions/min_length": 429.0, "epoch": 8.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2099475860595703, "kl": 0.011259421007707715, "learning_rate": 6.912231456185418e-07, "loss": 0.00011257454752922058, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 450.1875, "completions/min_length": 406.0, "epoch": 8.75294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010918148793280125, "kl": 0.007621638244017959, "learning_rate": 6.911045624572419e-07, "loss": 7.62633717386052e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 455.1875, "completions/min_length": 402.0, "epoch": 8.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8385002613067627, "kl": 0.012054081540554762, "learning_rate": 6.909859667064571e-07, "loss": 0.00012112408876419067, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 5953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 496.3125, "completions/min_length": 394.0, "epoch": 8.755882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013766501098871231, "kl": 0.00818721775431186, "learning_rate": 6.90867358374e-07, "loss": 8.190055086743087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 452.75, "completions/min_length": 416.0, "epoch": 8.757352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.885696291923523, "kl": 0.0105041234055534, "learning_rate": 6.907487374676844e-07, "loss": 0.00010442744678584859, "reward": 0.6241832971572876, "reward_std": 0.1333603411912918, "rewards/DrugCombAccuracyCOTORM/mean": 0.5557500123977661, "rewards/DrugCombAccuracyCOTORM/std": 0.4052073657512665, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7958333492279053, "rewards/DrugCombCoverageCOTORM/std": 0.19696775078773499, "step": 5955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 481.6875, "completions/min_length": 402.0, "epoch": 8.758823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0315582752227783, "kl": 0.014523548074066639, "learning_rate": 6.906301039953247e-07, "loss": 0.00014641880989074707, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 5956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 478.5625, "completions/min_length": 404.0, "epoch": 8.760294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9571710824966431, "kl": 0.010758280055597425, "learning_rate": 6.905114579647362e-07, "loss": 0.0001081346272258088, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 5957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 466.625, "completions/min_length": 378.0, "epoch": 8.761764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3904415369033813, "kl": 0.012449894100427628, "learning_rate": 6.90392799383735e-07, "loss": 0.00012529641389846802, "reward": 0.648900032043457, "reward_std": 0.42800652980804443, "rewards/DrugCombAccuracyCOTORM/mean": 0.5954999923706055, "rewards/DrugCombAccuracyCOTORM/std": 0.4757991135120392, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7250000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.5310367345809937, "step": 5958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 420.9375, "completions/min_length": 349.0, "epoch": 8.763235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0308626890182495, "kl": 0.011641676537692547, "learning_rate": 6.902741282601379e-07, "loss": 0.00011641532182693481, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 443.75, "completions/min_length": 408.0, "epoch": 8.764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.026850605383515358, "kl": 0.008329512202180922, "learning_rate": 6.901554446017629e-07, "loss": 8.325684757437557e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 441.3125, "completions/min_length": 399.0, "epoch": 8.766176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7810308933258057, "kl": 0.014139126753434539, "learning_rate": 6.900367484164282e-07, "loss": 0.00013874168507754803, "reward": 0.9011041522026062, "reward_std": 0.1865801066160202, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 5961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 524.5625, "completions/min_length": 419.0, "epoch": 8.76764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8690701723098755, "kl": 0.012757177464663982, "learning_rate": 6.899180397119536e-07, "loss": 0.00012797117233276367, "reward": 0.8655833601951599, "reward_std": 0.05750651657581329, "rewards/DrugCombAccuracyCOTORM/mean": 0.8384895920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.19306111335754395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 5962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 492.125, "completions/min_length": 430.0, "epoch": 8.769117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.541763424873352, "kl": 0.010685713961720467, "learning_rate": 6.897993184961592e-07, "loss": 0.00010497123003005981, "reward": 0.5687500238418579, "reward_std": 0.0883883386850357, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 5963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 471.75, "completions/min_length": 397.0, "epoch": 8.770588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.979560911655426, "kl": 0.012838930124416947, "learning_rate": 6.896805847768659e-07, "loss": 0.00012724846601486206, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 512.3125, "completions/min_length": 437.0, "epoch": 8.772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.820330023765564, "kl": 0.011365540092810988, "learning_rate": 6.895618385618959e-07, "loss": 0.00011363848170731217, "reward": 0.6535699963569641, "reward_std": 0.1441650390625, "rewards/DrugCombAccuracyCOTORM/mean": 0.5778999924659729, "rewards/DrugCombAccuracyCOTORM/std": 0.4978097677230835, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10246950387954712, "step": 5965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.25, "completions/min_length": 378.0, "epoch": 8.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013722887262701988, "kl": 0.009079171810299158, "learning_rate": 6.894430798590716e-07, "loss": 9.09025693545118e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 466.25, "completions/min_length": 414.0, "epoch": 8.775, "frac_reward_zero_std": 0.0, "grad_norm": 1.4716439247131348, "kl": 0.013965916587039828, "learning_rate": 6.893243086762165e-07, "loss": 0.0001395493745803833, "reward": 0.3375000059604645, "reward_std": 0.2655639052391052, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 5967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 404.3125, "completions/min_length": 383.0, "epoch": 8.776470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.007709929253906012, "kl": 0.0069365971721708775, "learning_rate": 6.892055250211551e-07, "loss": 6.923091859789565e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 464.9375, "completions/min_length": 380.0, "epoch": 8.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0887107253074646, "kl": 0.014552680542692542, "learning_rate": 6.890867289017126e-07, "loss": 0.000147344617289491, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 450.4375, "completions/min_length": 402.0, "epoch": 8.779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.029604027047753334, "kl": 0.009704236988909543, "learning_rate": 6.889679203257149e-07, "loss": 9.700098598841578e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 455.0625, "completions/min_length": 384.0, "epoch": 8.780882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.145795464515686, "kl": 0.0126345781609416, "learning_rate": 6.888490993009889e-07, "loss": 0.00012574523861985654, "reward": 0.8529167175292969, "reward_std": 0.14259988069534302, "rewards/DrugCombAccuracyCOTORM/mean": 0.8500000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.24765567481517792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.49018141627311707, "step": 5971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 500.625, "completions/min_length": 434.0, "epoch": 8.782352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3790205717086792, "kl": 0.009744073497131467, "learning_rate": 6.887302658353621e-07, "loss": 9.709596633911133e-05, "reward": 0.887499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 5972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 469.9375, "completions/min_length": 414.0, "epoch": 8.783823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015973661094903946, "kl": 0.006926276255398989, "learning_rate": 6.88611419936663e-07, "loss": 6.883257447043434e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 517.0, "completions/min_length": 442.0, "epoch": 8.785294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.427260160446167, "kl": 0.014118594815954566, "learning_rate": 6.884925616127207e-07, "loss": 0.00014191865921020508, "reward": 0.2956569194793701, "reward_std": 0.1761915236711502, "rewards/DrugCombAccuracyCOTORM/mean": 0.1888854205608368, "rewards/DrugCombAccuracyCOTORM/std": 0.25126680731773376, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4454861283302307, "rewards/DrugCombCoverageCOTORM/std": 0.41473859548568726, "step": 5974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 449.625, "completions/min_length": 391.0, "epoch": 8.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0857278108596802, "kl": 0.012049643555656075, "learning_rate": 6.883736908713657e-07, "loss": 0.0001200363039970398, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 515.6875, "completions/min_length": 395.0, "epoch": 8.788235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3122897148132324, "kl": 0.010781667195260525, "learning_rate": 6.882548077204284e-07, "loss": 0.00010881572961807251, "reward": 0.8413749933242798, "reward_std": 0.27461081743240356, "rewards/DrugCombAccuracyCOTORM/mean": 0.8290624618530273, "rewards/DrugCombAccuracyCOTORM/std": 0.322460412979126, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.5189688801765442, "step": 5976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 436.75, "completions/min_length": 365.0, "epoch": 8.78970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011197308078408241, "kl": 0.008622222463600338, "learning_rate": 6.881359121677412e-07, "loss": 8.615794649813324e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 439.8125, "completions/min_length": 355.0, "epoch": 8.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.117517352104187, "kl": 0.012832963606342673, "learning_rate": 6.880170042211359e-07, "loss": 0.00012862091534771025, "reward": 0.30000001192092896, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 5978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 427.8125, "completions/min_length": 395.0, "epoch": 8.79264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04023687168955803, "kl": 0.00999778846744448, "learning_rate": 6.878980838884462e-07, "loss": 0.00010007491800934076, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 489.125, "completions/min_length": 432.0, "epoch": 8.794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.975920557975769, "kl": 0.012096273596398532, "learning_rate": 6.877791511775063e-07, "loss": 0.00012149661779403687, "reward": 0.4062500298023224, "reward_std": 0.04134397208690643, "rewards/DrugCombAccuracyCOTORM/mean": 0.3958333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.41488510370254517, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1041666567325592, "rewards/DrugCombCoverageCOTORM/std": 0.932688295841217, "step": 5980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 473.4375, "completions/min_length": 414.0, "epoch": 8.795588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7533567547798157, "kl": 0.007932259235531092, "learning_rate": 6.87660206096151e-07, "loss": 7.914958405308425e-05, "reward": 0.6291666626930237, "reward_std": 0.15242744982242584, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.45338237285614014, "step": 5981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 473.5625, "completions/min_length": 405.0, "epoch": 8.797058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013157087378203869, "kl": 0.009722894988954067, "learning_rate": 6.875412486522164e-07, "loss": 9.739748202264309e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 471.5, "completions/min_length": 429.0, "epoch": 8.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01683523878455162, "kl": 0.010621840134263039, "learning_rate": 6.87422278853539e-07, "loss": 0.00010590124293230474, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 421.0, "completions/min_length": 384.0, "epoch": 8.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.012978590093553066, "kl": 0.009982171468436718, "learning_rate": 6.87303296707956e-07, "loss": 9.957587462849915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 464.5, "completions/min_length": 383.0, "epoch": 8.801470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7071433663368225, "kl": 0.0083137396723032, "learning_rate": 6.871843022233059e-07, "loss": 8.287252421723679e-05, "reward": 0.7819166779518127, "reward_std": 0.15579043328762054, "rewards/DrugCombAccuracyCOTORM/mean": 0.7404166460037231, "rewards/DrugCombAccuracyCOTORM/std": 0.3676992952823639, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 5985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 478.9375, "completions/min_length": 400.0, "epoch": 8.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0444369316101074, "kl": 0.013951567350886762, "learning_rate": 6.870652954074277e-07, "loss": 0.000139732874231413, "reward": 0.6974999904632568, "reward_std": 0.18683454394340515, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 5986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 483.3125, "completions/min_length": 403.0, "epoch": 8.804411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.3895148038864136, "kl": 0.01442406396381557, "learning_rate": 6.869462762681612e-07, "loss": 0.00014545023441314697, "reward": 0.5839166641235352, "reward_std": 0.4519888162612915, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 5987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 441.0, "completions/min_length": 390.0, "epoch": 8.805882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.838612675666809, "kl": 0.013182551716454327, "learning_rate": 6.868272448133471e-07, "loss": 0.00013168207078706473, "reward": 0.695687472820282, "reward_std": 0.13157731294631958, "rewards/DrugCombAccuracyCOTORM/mean": 0.6449999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4190465211868286, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 5988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 488.375, "completions/min_length": 382.0, "epoch": 8.80735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9807100296020508, "kl": 0.00921391707379371, "learning_rate": 6.86708201050827e-07, "loss": 9.190663695335388e-05, "reward": 0.8517667055130005, "reward_std": 0.1738913208246231, "rewards/DrugCombAccuracyCOTORM/mean": 0.825124979019165, "rewards/DrugCombAccuracyCOTORM/std": 0.3314368724822998, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.18257419764995575, "step": 5989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 472.9375, "completions/min_length": 424.0, "epoch": 8.808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9367679953575134, "kl": 0.012081226333975792, "learning_rate": 6.86589144988443e-07, "loss": 0.00012132077245041728, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 375.6875, "completions/min_length": 318.0, "epoch": 8.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03637773171067238, "kl": 0.01191672938875854, "learning_rate": 6.864700766340383e-07, "loss": 0.00011865394480992109, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 465.375, "completions/min_length": 439.0, "epoch": 8.811764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.012051276862621307, "kl": 0.009427101933397353, "learning_rate": 6.86350995995457e-07, "loss": 9.392833453603089e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 448.25, "completions/min_length": 392.0, "epoch": 8.813235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.166916847229004, "kl": 0.012546409154310822, "learning_rate": 6.862319030805436e-07, "loss": 0.00012521377357188612, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 451.5625, "completions/min_length": 393.0, "epoch": 8.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9304104447364807, "kl": 0.012523964163847268, "learning_rate": 6.861127978971438e-07, "loss": 0.00012418394908308983, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 431.5, "completions/min_length": 357.0, "epoch": 8.816176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.013108287937939167, "kl": 0.008146674372255802, "learning_rate": 6.859936804531038e-07, "loss": 8.168812928488478e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 455.5, "completions/min_length": 399.0, "epoch": 8.81764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017453668639063835, "kl": 0.008551087346859276, "learning_rate": 6.858745507562708e-07, "loss": 8.573185914428905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 485.25, "completions/min_length": 424.0, "epoch": 8.819117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9601888656616211, "kl": 0.010494708782061934, "learning_rate": 6.857554088144928e-07, "loss": 0.0001048888370860368, "reward": 0.565750002861023, "reward_std": 0.0765974298119545, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8668269515037537, "step": 5997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 486.0625, "completions/min_length": 423.0, "epoch": 8.820588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.5985549092292786, "kl": 0.008183787576854229, "learning_rate": 6.856362546356185e-07, "loss": 8.175655966624618e-05, "reward": 0.831250011920929, "reward_std": 0.10415475070476532, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 5998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 467.1875, "completions/min_length": 369.0, "epoch": 8.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013446440920233727, "kl": 0.006781390286050737, "learning_rate": 6.855170882274977e-07, "loss": 6.81940873619169e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 5999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.875, "completions/min_length": 376.0, "epoch": 8.823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.008812233805656433, "kl": 0.007230115821585059, "learning_rate": 6.853979095979803e-07, "loss": 7.193323835963383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 487.8125, "completions/min_length": 389.0, "epoch": 8.825, "frac_reward_zero_std": 0.0, "grad_norm": 1.3848176002502441, "kl": 0.012621462577953935, "learning_rate": 6.852787187549181e-07, "loss": 0.00012657791376113892, "reward": 0.8551250100135803, "reward_std": 0.3120896816253662, "rewards/DrugCombAccuracyCOTORM/mean": 0.8228124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.38252657651901245, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 462.4375, "completions/min_length": 404.0, "epoch": 8.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03950492665171623, "kl": 0.012498654890805483, "learning_rate": 6.851595157061627e-07, "loss": 0.00012610301200766116, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 6002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.375, "completions/min_length": 389.0, "epoch": 8.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01709742471575737, "kl": 0.008784859790466726, "learning_rate": 6.85040300459567e-07, "loss": 8.806232654023916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 519.75, "completions/min_length": 451.0, "epoch": 8.829411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.21571683883667, "kl": 0.012201928533613682, "learning_rate": 6.849210730229846e-07, "loss": 0.00012179464101791382, "reward": 0.5217361450195312, "reward_std": 0.2270238995552063, "rewards/DrugCombAccuracyCOTORM/mean": 0.42847222089767456, "rewards/DrugCombAccuracyCOTORM/std": 0.46352240443229675, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7895833253860474, "rewards/DrugCombCoverageCOTORM/std": 0.2334027737379074, "step": 6004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 434.0625, "completions/min_length": 390.0, "epoch": 8.830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.009640057571232319, "kl": 0.007987395976670086, "learning_rate": 6.848018334042697e-07, "loss": 7.969618309289217e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 473.625, "completions/min_length": 428.0, "epoch": 8.83235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7812581062316895, "kl": 0.010593732935376465, "learning_rate": 6.846825816112778e-07, "loss": 0.00010579955414868891, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 448.6875, "completions/min_length": 398.0, "epoch": 8.833823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.009255290031433, "kl": 0.009900040924549103, "learning_rate": 6.84563317651865e-07, "loss": 9.911789675243199e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 494.5, "completions/min_length": 415.0, "epoch": 8.83529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7776572108268738, "kl": 0.007708074641413987, "learning_rate": 6.844440415338878e-07, "loss": 7.697778346482664e-05, "reward": 0.7516499757766724, "reward_std": 0.16868963837623596, "rewards/DrugCombAccuracyCOTORM/mean": 0.7156041860580444, "rewards/DrugCombAccuracyCOTORM/std": 0.3889705240726471, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 492.625, "completions/min_length": 387.0, "epoch": 8.836764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 3.679659843444824, "kl": 0.008966088993474841, "learning_rate": 6.843247532652038e-07, "loss": 8.95283737918362e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 491.5625, "completions/min_length": 399.0, "epoch": 8.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.965768039226532, "kl": 0.010012633865699172, "learning_rate": 6.842054528536716e-07, "loss": 0.00010006243974203244, "reward": 0.8333333730697632, "reward_std": 0.0942809209227562, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 465.25, "completions/min_length": 412.0, "epoch": 8.839705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011253693141043186, "kl": 0.009097306057810783, "learning_rate": 6.840861403071503e-07, "loss": 9.12719260668382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 495.625, "completions/min_length": 432.0, "epoch": 8.841176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9724794030189514, "kl": 0.013094593305140734, "learning_rate": 6.839668156334999e-07, "loss": 0.00012964407505933195, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 467.8125, "completions/min_length": 402.0, "epoch": 8.842647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.2714734077453613, "kl": 0.014553607441484928, "learning_rate": 6.838474788405814e-07, "loss": 0.00014767050743103027, "reward": 0.8089166879653931, "reward_std": 0.3232209086418152, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 443.5625, "completions/min_length": 383.0, "epoch": 8.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01934564672410488, "kl": 0.008987096603959799, "learning_rate": 6.837281299362561e-07, "loss": 8.868299482855946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 507.0, "completions/min_length": 440.0, "epoch": 8.845588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.73011314868927, "kl": 0.010385045083239675, "learning_rate": 6.836087689283866e-07, "loss": 0.00010449439287185669, "reward": 0.4729166626930237, "reward_std": 0.24331235885620117, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.57373046875, "step": 6015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 451.0625, "completions/min_length": 384.0, "epoch": 8.847058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8902578949928284, "kl": 0.011923165060579777, "learning_rate": 6.83489395824836e-07, "loss": 0.00011830031871795654, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 422.375, "completions/min_length": 361.0, "epoch": 8.848529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.014422331936657429, "kl": 0.0085058375261724, "learning_rate": 6.833700106334685e-07, "loss": 8.557256660424173e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 462.125, "completions/min_length": 420.0, "epoch": 8.85, "frac_reward_zero_std": 0.5, "grad_norm": 0.9568339586257935, "kl": 0.01109027466736734, "learning_rate": 6.832506133621486e-07, "loss": 0.00011069636093452573, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 472.5625, "completions/min_length": 403.0, "epoch": 8.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.384231686592102, "kl": 0.023436357616446912, "learning_rate": 6.831312040187423e-07, "loss": 0.00023578258696943521, "reward": 0.7762500047683716, "reward_std": 0.1381015032529831, "rewards/DrugCombAccuracyCOTORM/mean": 0.7281249761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.36249998211860657, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 6019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 429.1875, "completions/min_length": 326.0, "epoch": 8.852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1887999773025513, "kl": 0.011383720440790057, "learning_rate": 6.830117826111155e-07, "loss": 0.00011397434718674049, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 477.875, "completions/min_length": 431.0, "epoch": 8.854411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.473154067993164, "kl": 0.014515425777062774, "learning_rate": 6.828923491471356e-07, "loss": 0.0001447051763534546, "reward": 0.550000011920929, "reward_std": 0.4208286702632904, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 425.0625, "completions/min_length": 321.0, "epoch": 8.855882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 15.509500503540039, "kl": 0.3391801102552563, "learning_rate": 6.827729036346706e-07, "loss": 0.0033785246778279543, "reward": 0.8516666889190674, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 6022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 447.3125, "completions/min_length": 397.0, "epoch": 8.85735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009477891959249973, "kl": 0.008400244056247175, "learning_rate": 6.826534460815892e-07, "loss": 8.393670577788725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 439.0, "completions/min_length": 388.0, "epoch": 8.858823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01578674651682377, "kl": 0.009850409463979304, "learning_rate": 6.825339764957611e-07, "loss": 9.939220763044432e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 457.25, "completions/min_length": 412.0, "epoch": 8.860294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.2346830368041992, "kl": 0.010656865779310465, "learning_rate": 6.824144948850567e-07, "loss": 0.00010586529970169067, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 407.5625, "completions/min_length": 341.0, "epoch": 8.861764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014670554548501968, "kl": 0.010181050281971693, "learning_rate": 6.822950012573468e-07, "loss": 0.00010205880244029686, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 411.75, "completions/min_length": 340.0, "epoch": 8.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.8099536895751953, "kl": 0.012976422905921936, "learning_rate": 6.821754956205037e-07, "loss": 0.0001296751288464293, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/mean_length": 505.875, "completions/min_length": 384.0, "epoch": 8.864705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.122517466545105, "kl": 0.012204255908727646, "learning_rate": 6.820559779823998e-07, "loss": 0.00012345639697741717, "reward": 0.49446815252304077, "reward_std": 0.07894683629274368, "rewards/DrugCombAccuracyCOTORM/mean": 0.4352726638317108, "rewards/DrugCombAccuracyCOTORM/std": 0.46755725145339966, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4625000059604645, "rewards/DrugCombCoverageCOTORM/std": 0.4869976341724396, "step": 6028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 418.3125, "completions/min_length": 378.0, "epoch": 8.866176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.020373007282614708, "kl": 0.009863412007689476, "learning_rate": 6.819364483509089e-07, "loss": 9.850243804976344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 402.0, "completions/min_length": 364.0, "epoch": 8.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01569298654794693, "kl": 0.010646817274391651, "learning_rate": 6.818169067339051e-07, "loss": 0.00010545728582656011, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 428.9375, "completions/min_length": 384.0, "epoch": 8.869117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.007365607190877199, "kl": 0.006206694641150534, "learning_rate": 6.816973531392638e-07, "loss": 6.214412132976577e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 461.8125, "completions/min_length": 395.0, "epoch": 8.870588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6243072748184204, "kl": 0.010960318148136139, "learning_rate": 6.815777875748606e-07, "loss": 0.00010970979928970337, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 464.625, "completions/min_length": 396.0, "epoch": 8.87205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009222187101840973, "kl": 0.007243741070851684, "learning_rate": 6.814582100485722e-07, "loss": 7.233069482026622e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 492.0625, "completions/min_length": 382.0, "epoch": 8.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8138043880462646, "kl": 0.008269472746178508, "learning_rate": 6.813386205682763e-07, "loss": 8.328811236424372e-05, "reward": 0.9259583353996277, "reward_std": 0.12694284319877625, "rewards/DrugCombAccuracyCOTORM/mean": 0.9113541841506958, "rewards/DrugCombAccuracyCOTORM/std": 0.2228761613368988, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 491.5625, "completions/min_length": 360.0, "epoch": 8.875, "frac_reward_zero_std": 1.0, "grad_norm": 0.01123413722962141, "kl": 0.010551453568041325, "learning_rate": 6.812190191418508e-07, "loss": 0.00010491267312318087, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 431.625, "completions/min_length": 403.0, "epoch": 8.876470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012529762461781502, "kl": 0.008669774630106986, "learning_rate": 6.810994057771751e-07, "loss": 8.667970541864634e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 475.125, "completions/min_length": 430.0, "epoch": 8.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020573463290929794, "kl": 0.012448783498257399, "learning_rate": 6.809797804821288e-07, "loss": 0.00012438940757419914, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 462.8125, "completions/min_length": 411.0, "epoch": 8.879411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8829947113990784, "kl": 0.010659153223969042, "learning_rate": 6.808601432645925e-07, "loss": 0.00010655820369720459, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 481.875, "completions/min_length": 383.0, "epoch": 8.880882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0766218900680542, "kl": 0.010335733415558934, "learning_rate": 6.807404941324477e-07, "loss": 0.0001034960150718689, "reward": 0.8655833601951599, "reward_std": 0.05750651657581329, "rewards/DrugCombAccuracyCOTORM/mean": 0.8384895920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.19306111335754395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 6039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 466.1875, "completions/min_length": 417.0, "epoch": 8.882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.05287390202283859, "kl": 0.012951593147590756, "learning_rate": 6.806208330935766e-07, "loss": 0.00012845371384173632, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 483.0, "completions/min_length": 375.0, "epoch": 8.883823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0062681436538696, "kl": 0.012874202686361969, "learning_rate": 6.805011601558619e-07, "loss": 0.0001275561808142811, "reward": 0.9241829514503479, "reward_std": 0.1608095020055771, "rewards/DrugCombAccuracyCOTORM/mean": 0.9130411744117737, "rewards/DrugCombAccuracyCOTORM/std": 0.2623177170753479, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 456.0625, "completions/min_length": 410.0, "epoch": 8.885294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02422870323061943, "kl": 0.009426979813724756, "learning_rate": 6.803814753271877e-07, "loss": 9.428757766727358e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 468.375, "completions/min_length": 399.0, "epoch": 8.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8463930487632751, "kl": 0.014211536617949605, "learning_rate": 6.802617786154385e-07, "loss": 0.0001430697739124298, "reward": 0.7250000238418579, "reward_std": 0.18322508037090302, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.4732423722743988, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 398.875, "completions/min_length": 373.0, "epoch": 8.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014850695617496967, "kl": 0.007295674295164645, "learning_rate": 6.801420700284995e-07, "loss": 7.304962491616607e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 461.75, "completions/min_length": 354.0, "epoch": 8.889705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7636812925338745, "kl": 0.010916172061115503, "learning_rate": 6.800223495742566e-07, "loss": 0.0001088753342628479, "reward": 0.6000000238418579, "reward_std": 0.16256865859031677, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 6045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 453.0625, "completions/min_length": 376.0, "epoch": 8.891176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8637117147445679, "kl": 0.010412024217657745, "learning_rate": 6.799026172605969e-07, "loss": 0.00010456889867782593, "reward": 0.1535000056028366, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.08250000327825546, "rewards/DrugCombAccuracyCOTORM/std": 0.08520564436912537, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 457.9375, "completions/min_length": 411.0, "epoch": 8.89264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01438892725855112, "kl": 0.009602010366506875, "learning_rate": 6.797828730954082e-07, "loss": 9.644675446907058e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 446.9375, "completions/min_length": 376.0, "epoch": 8.894117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.1846424341201782, "kl": 0.01582087273709476, "learning_rate": 6.796631170865788e-07, "loss": 0.00015692785382270813, "reward": 0.9270833134651184, "reward_std": 0.1593482345342636, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 500.5, "completions/min_length": 445.0, "epoch": 8.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8561849594116211, "kl": 0.009732260601595044, "learning_rate": 6.795433492419979e-07, "loss": 9.640306234359741e-05, "reward": 0.7651041746139526, "reward_std": 0.19553470611572266, "rewards/DrugCombAccuracyCOTORM/mean": 0.7395833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3989279568195343, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.40824830532073975, "step": 6049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 458.5625, "completions/min_length": 392.0, "epoch": 8.897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.008836363442242146, "kl": 0.00680450804065913, "learning_rate": 6.794235695695554e-07, "loss": 6.774729263270274e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 460.625, "completions/min_length": 416.0, "epoch": 8.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9431944489479065, "kl": 0.012159909587353468, "learning_rate": 6.793037780771423e-07, "loss": 0.00012168288230895996, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 471.6875, "completions/min_length": 402.0, "epoch": 8.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.9315484166145325, "kl": 0.010445474996231496, "learning_rate": 6.7918397477265e-07, "loss": 0.00010351322998758405, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 6052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 467.0625, "completions/min_length": 449.0, "epoch": 8.901470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.01309286616742611, "kl": 0.008772031054832041, "learning_rate": 6.79064159663971e-07, "loss": 8.7422551587224e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/mean_length": 540.5, "completions/min_length": 374.0, "epoch": 8.902941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8079600930213928, "kl": 0.008450034772977233, "learning_rate": 6.789443327589983e-07, "loss": 8.43927264213562e-05, "reward": 0.7654232978820801, "reward_std": 0.14653924107551575, "rewards/DrugCombAccuracyCOTORM/mean": 0.7131592631340027, "rewards/DrugCombAccuracyCOTORM/std": 0.3883221745491028, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9489583373069763, "rewards/DrugCombCoverageCOTORM/std": 0.0911792665719986, "step": 6054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 459.9375, "completions/min_length": 412.0, "epoch": 8.904411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010421549901366234, "kl": 0.006788535276427865, "learning_rate": 6.788244940656258e-07, "loss": 6.794369255658239e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 440.8125, "completions/min_length": 394.0, "epoch": 8.905882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010045924223959446, "kl": 0.007695219130255282, "learning_rate": 6.787046435917482e-07, "loss": 7.670991180930287e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 6056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 515.5, "completions/min_length": 457.0, "epoch": 8.907352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.382982611656189, "kl": 0.016130987089127302, "learning_rate": 6.785847813452609e-07, "loss": 0.00016014277935028076, "reward": 0.6147010326385498, "reward_std": 0.22871816158294678, "rewards/DrugCombAccuracyCOTORM/mean": 0.5347824692726135, "rewards/DrugCombAccuracyCOTORM/std": 0.3928855061531067, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8687499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.15370427072048187, "step": 6057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 433.0625, "completions/min_length": 374.0, "epoch": 8.908823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.4149106740951538, "kl": 0.010749415261670947, "learning_rate": 6.784649073340601e-07, "loss": 0.00010664016008377075, "reward": 0.25558334589004517, "reward_std": 0.19983510673046112, "rewards/DrugCombAccuracyCOTORM/mean": 0.14499999582767487, "rewards/DrugCombAccuracyCOTORM/std": 0.25219041109085083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3958333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.36955931782722473, "step": 6058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 481.3125, "completions/min_length": 443.0, "epoch": 8.910294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4194356203079224, "kl": 0.012187583139166236, "learning_rate": 6.783450215660429e-07, "loss": 0.0001214742660522461, "reward": 0.5639166831970215, "reward_std": 0.28586870431900024, "rewards/DrugCombAccuracyCOTORM/mean": 0.47833332419395447, "rewards/DrugCombAccuracyCOTORM/std": 0.4097515344619751, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.20069323480129242, "step": 6059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 467.75, "completions/min_length": 420.0, "epoch": 8.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.6685534715652466, "kl": 0.009946304839104414, "learning_rate": 6.78225124049107e-07, "loss": 9.823590517044067e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 446.4375, "completions/min_length": 367.0, "epoch": 8.913235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.9202151894569397, "kl": 0.012371625052765012, "learning_rate": 6.781052147911509e-07, "loss": 0.00012393716315273196, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 444.875, "completions/min_length": 355.0, "epoch": 8.91470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009095912799239159, "kl": 0.007960530347190797, "learning_rate": 6.779852938000741e-07, "loss": 7.975680637173355e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 452.375, "completions/min_length": 392.0, "epoch": 8.916176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.9143997430801392, "kl": 0.020780825288966298, "learning_rate": 6.778653610837765e-07, "loss": 0.00020569161279127002, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 6063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.8125, "completions/min_length": 414.0, "epoch": 8.91764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9814043641090393, "kl": 0.01417256286367774, "learning_rate": 6.77745416650159e-07, "loss": 0.00014281272888183594, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 448.0, "completions/min_length": 359.0, "epoch": 8.919117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 1.432131290435791, "kl": 0.01068798964843154, "learning_rate": 6.776254605071232e-07, "loss": 0.000108258769614622, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 466.9375, "completions/min_length": 379.0, "epoch": 8.920588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014964603818953037, "kl": 0.009922178112901747, "learning_rate": 6.775054926625718e-07, "loss": 9.922750177793205e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 413.625, "completions/min_length": 381.0, "epoch": 8.922058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1332833766937256, "kl": 0.009517844649963081, "learning_rate": 6.773855131244076e-07, "loss": 9.500980377197266e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 480.25, "completions/min_length": 440.0, "epoch": 8.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7841541767120361, "kl": 0.009262303821742535, "learning_rate": 6.772655219005348e-07, "loss": 9.269267320632935e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 472.0, "completions/min_length": 439.0, "epoch": 8.925, "frac_reward_zero_std": 0.5, "grad_norm": 1.2555394172668457, "kl": 0.02112262207083404, "learning_rate": 6.771455189988579e-07, "loss": 0.0002118721604347229, "reward": 0.9437500238418579, "reward_std": 0.13999362289905548, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 461.25, "completions/min_length": 407.0, "epoch": 8.926470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01407614629715681, "kl": 0.007319068419747055, "learning_rate": 6.770255044272826e-07, "loss": 7.390011160168797e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 517.9375, "completions/min_length": 424.0, "epoch": 8.927941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8272922039031982, "kl": 0.010100558283738792, "learning_rate": 6.76905478193715e-07, "loss": 0.00010099261999130249, "reward": 0.6244750022888184, "reward_std": 0.0830409824848175, "rewards/DrugCombAccuracyCOTORM/mean": 0.570437490940094, "rewards/DrugCombAccuracyCOTORM/std": 0.45772743225097656, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6812499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.4069705307483673, "step": 6071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 447.5, "completions/min_length": 401.0, "epoch": 8.929411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.025812605395913124, "kl": 0.009583776234649122, "learning_rate": 6.767854403060623e-07, "loss": 9.659973147790879e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 424.5, "completions/min_length": 364.0, "epoch": 8.930882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.019505295902490616, "kl": 0.009951432468369603, "learning_rate": 6.766653907722319e-07, "loss": 9.931356180459261e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 488.8125, "completions/min_length": 421.0, "epoch": 8.93235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8814533948898315, "kl": 0.006520218215882778, "learning_rate": 6.765453296001329e-07, "loss": 6.508142541861162e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 443.125, "completions/min_length": 399.0, "epoch": 8.933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014733310788869858, "kl": 0.007979381596669555, "learning_rate": 6.764252567976742e-07, "loss": 7.999682566151023e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 495.25, "completions/min_length": 431.0, "epoch": 8.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.06518123298883438, "kl": 0.0121945213759318, "learning_rate": 6.763051723727662e-07, "loss": 0.00012212579895276576, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 450.3125, "completions/min_length": 371.0, "epoch": 8.936764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8218100070953369, "kl": 0.008551610866561532, "learning_rate": 6.761850763333196e-07, "loss": 8.533567597623914e-05, "reward": 0.574999988079071, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 6077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 441.0, "completions/min_length": 400.0, "epoch": 8.938235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.010500332340598106, "kl": 0.007502168766222894, "learning_rate": 6.760649686872462e-07, "loss": 7.511522562708706e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 423.5, "completions/min_length": 343.0, "epoch": 8.939705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018930036574602127, "kl": 0.010321906651370227, "learning_rate": 6.759448494424578e-07, "loss": 0.0001050017453962937, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 393.8125, "completions/min_length": 342.0, "epoch": 8.941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.3572187423706055, "kl": 0.012523801298812032, "learning_rate": 6.758247186068683e-07, "loss": 0.00012227811384946108, "reward": 0.885937511920929, "reward_std": 0.2112291157245636, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/mean_length": 524.625, "completions/min_length": 418.0, "epoch": 8.94264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.147038221359253, "kl": 0.012446389999240637, "learning_rate": 6.757045761883913e-07, "loss": 0.0001257591793546453, "reward": 0.7510416507720947, "reward_std": 0.10460029542446136, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.356000155210495, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.1717960685491562, "step": 6081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 495.0625, "completions/min_length": 404.0, "epoch": 8.944117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.162787914276123, "kl": 0.009478079504333436, "learning_rate": 6.755844221949413e-07, "loss": 9.535506251268089e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 458.9375, "completions/min_length": 417.0, "epoch": 8.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9898936748504639, "kl": 0.011132187908515334, "learning_rate": 6.75464256634434e-07, "loss": 0.00011083197023253888, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 438.375, "completions/min_length": 383.0, "epoch": 8.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6476045846939087, "kl": 0.008941079955548048, "learning_rate": 6.753440795147856e-07, "loss": 8.94562472240068e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 445.6875, "completions/min_length": 371.0, "epoch": 8.948529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8069668412208557, "kl": 0.013315446209162474, "learning_rate": 6.75223890843913e-07, "loss": 0.00013141179806552827, "reward": 0.8835000395774841, "reward_std": 0.01932758092880249, "rewards/DrugCombAccuracyCOTORM/mean": 0.8647916316986084, "rewards/DrugCombAccuracyCOTORM/std": 0.14349070191383362, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 6085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 420.9375, "completions/min_length": 386.0, "epoch": 8.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.00915154255926609, "kl": 0.008999216021038592, "learning_rate": 6.751036906297337e-07, "loss": 8.959631668403745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 461.5625, "completions/min_length": 411.0, "epoch": 8.951470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8138048052787781, "kl": 0.010986068635247648, "learning_rate": 6.749834788801665e-07, "loss": 0.00011026114225387573, "reward": 0.9078333377838135, "reward_std": 0.13880041241645813, "rewards/DrugCombAccuracyCOTORM/mean": 0.8899999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.25806114077568054, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 6087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 487.0, "completions/min_length": 401.0, "epoch": 8.952941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9001235961914062, "kl": 0.012971654301509261, "learning_rate": 6.748632556031305e-07, "loss": 0.00013033300638198853, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 523.625, "completions/min_length": 401.0, "epoch": 8.954411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.7945690155029297, "kl": 0.010680303676053882, "learning_rate": 6.747430208065458e-07, "loss": 0.00010801106691360474, "reward": 0.6712222099304199, "reward_std": 0.17323940992355347, "rewards/DrugCombAccuracyCOTORM/mean": 0.6133333444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.485395610332489, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8055555820465088, "rewards/DrugCombCoverageCOTORM/std": 0.3382355570793152, "step": 6089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 481.0, "completions/min_length": 437.0, "epoch": 8.955882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5130972862243652, "kl": 0.011696191038936377, "learning_rate": 6.746227744983332e-07, "loss": 0.00011643767356872559, "reward": 0.35891667008399963, "reward_std": 0.26104649901390076, "rewards/DrugCombAccuracyCOTORM/mean": 0.26375001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.4423705041408539, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.75, "step": 6090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 448.1875, "completions/min_length": 381.0, "epoch": 8.95735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009263969026505947, "kl": 0.008126469678245485, "learning_rate": 6.74502516686414e-07, "loss": 8.133943629218265e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 399.375, "completions/min_length": 354.0, "epoch": 8.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.02573412097990513, "kl": 0.011329269385896623, "learning_rate": 6.743822473787107e-07, "loss": 0.00011287032248219475, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 448.25, "completions/min_length": 384.0, "epoch": 8.96029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6480466723442078, "kl": 0.007525111781433225, "learning_rate": 6.742619665831461e-07, "loss": 7.432095299009234e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 446.6875, "completions/min_length": 362.0, "epoch": 8.961764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9256318807601929, "kl": 0.0073330492014065385, "learning_rate": 6.741416743076443e-07, "loss": 7.441570778610185e-05, "reward": 0.7310428619384766, "reward_std": 0.12053678184747696, "rewards/DrugCombAccuracyCOTORM/mean": 0.6794285774230957, "rewards/DrugCombAccuracyCOTORM/std": 0.38365545868873596, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 6094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 475.1875, "completions/min_length": 402.0, "epoch": 8.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1781455278396606, "kl": 0.009125954238697886, "learning_rate": 6.740213705601297e-07, "loss": 9.140282782027498e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 447.6875, "completions/min_length": 407.0, "epoch": 8.964705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022813808172941208, "kl": 0.009036109549924731, "learning_rate": 6.739010553485275e-07, "loss": 9.001031867228448e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 471.5625, "completions/min_length": 405.0, "epoch": 8.966176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.3696935176849365, "kl": 0.012292398721911013, "learning_rate": 6.737807286807639e-07, "loss": 0.0001246035099029541, "reward": 0.5678583383560181, "reward_std": 0.04169582203030586, "rewards/DrugCombAccuracyCOTORM/mean": 0.5158125162124634, "rewards/DrugCombAccuracyCOTORM/std": 0.5018610954284668, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5520833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5764474868774414, "step": 6097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 415.0, "completions/min_length": 375.0, "epoch": 8.967647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.009572433307766914, "kl": 0.008242624928243458, "learning_rate": 6.736603905647656e-07, "loss": 8.215329580707476e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 453.8125, "completions/min_length": 369.0, "epoch": 8.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1590163707733154, "kl": 0.017699797870591283, "learning_rate": 6.735400410084602e-07, "loss": 0.0001756057026796043, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 464.0625, "completions/min_length": 429.0, "epoch": 8.970588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.4558228254318237, "kl": 0.013735318556427956, "learning_rate": 6.734196800197762e-07, "loss": 0.00013768300414085388, "reward": 0.737500011920929, "reward_std": 0.39058569073677063, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 444.5625, "completions/min_length": 391.0, "epoch": 8.972058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012888164259493351, "kl": 0.007450197008438408, "learning_rate": 6.732993076066427e-07, "loss": 7.474535232177004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 434.4375, "completions/min_length": 366.0, "epoch": 8.973529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.126457691192627, "kl": 0.008570995181798935, "learning_rate": 6.731789237769891e-07, "loss": 8.590164361521602e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 472.0625, "completions/min_length": 382.0, "epoch": 8.975, "frac_reward_zero_std": 0.5, "grad_norm": 1.1444895267486572, "kl": 0.014262647135183215, "learning_rate": 6.730585285387465e-07, "loss": 0.00014232668036129326, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 470.3125, "completions/min_length": 369.0, "epoch": 8.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8078023195266724, "kl": 0.00831624015700072, "learning_rate": 6.729381218998459e-07, "loss": 8.353704470209777e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 427.375, "completions/min_length": 379.0, "epoch": 8.977941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9966105818748474, "kl": 0.009838492260314524, "learning_rate": 6.728177038682195e-07, "loss": 9.796072845347226e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 434.75, "completions/min_length": 369.0, "epoch": 8.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8777326941490173, "kl": 0.00792636361438781, "learning_rate": 6.726972744518001e-07, "loss": 7.931143045425415e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 499.875, "completions/min_length": 445.0, "epoch": 8.980882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013737490400671959, "kl": 0.00993293127976358, "learning_rate": 6.725768336585213e-07, "loss": 9.82186829787679e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 402.875, "completions/min_length": 385.0, "epoch": 8.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170289307832718, "kl": 0.008834495674818754, "learning_rate": 6.724563814963173e-07, "loss": 8.84070759639144e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 454.25, "completions/min_length": 398.0, "epoch": 8.983823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9370045065879822, "kl": 0.009079090435989201, "learning_rate": 6.723359179731235e-07, "loss": 9.08002257347107e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 434.5, "completions/min_length": 354.0, "epoch": 8.985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.013526588678359985, "kl": 0.010431891190819442, "learning_rate": 6.722154430968754e-07, "loss": 0.00010447997192386538, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 450.125, "completions/min_length": 402.0, "epoch": 8.986764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.292544960975647, "kl": 0.011874435702338815, "learning_rate": 6.720949568755099e-07, "loss": 0.0001188516616821289, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 462.875, "completions/min_length": 395.0, "epoch": 8.988235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6182072162628174, "kl": 0.011788023635745049, "learning_rate": 6.71974459316964e-07, "loss": 0.00011809170246124268, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 544.6875, "completions/min_length": 441.0, "epoch": 8.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9360707402229309, "kl": 0.008947405382059515, "learning_rate": 6.718539504291761e-07, "loss": 8.958950638771057e-05, "reward": 0.48375001549720764, "reward_std": 0.09217500686645508, "rewards/DrugCombAccuracyCOTORM/mean": 0.41914063692092896, "rewards/DrugCombAccuracyCOTORM/std": 0.4595293700695038, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.484375, "rewards/DrugCombCoverageCOTORM/std": 0.5018196105957031, "step": 6113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 448.0, "completions/min_length": 403.0, "epoch": 8.991176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.8548461198806763, "kl": 0.014753568917512894, "learning_rate": 6.717334302200848e-07, "loss": 0.00014606118202209473, "reward": 0.8312499523162842, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 464.3125, "completions/min_length": 405.0, "epoch": 8.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8346878290176392, "kl": 0.008305081748403609, "learning_rate": 6.716128986976296e-07, "loss": 8.355826139450073e-05, "reward": 0.9770833253860474, "reward_std": 0.06481810659170151, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 478.4375, "completions/min_length": 410.0, "epoch": 8.994117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0140883922576904, "kl": 0.01210355176590383, "learning_rate": 6.714923558697511e-07, "loss": 0.00012007808254566044, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 519.375, "completions/min_length": 429.0, "epoch": 8.995588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.316969633102417, "kl": 0.013789327815175056, "learning_rate": 6.713718017443901e-07, "loss": 0.00013862550258636475, "reward": 0.6499999761581421, "reward_std": 0.4719901382923126, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.730296790599823, "step": 6117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 440.5, "completions/min_length": 393.0, "epoch": 8.99705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010844849050045013, "kl": 0.0073155565187335014, "learning_rate": 6.712512363294885e-07, "loss": 7.295898831216618e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 445.9375, "completions/min_length": 404.0, "epoch": 8.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.024660907685756683, "kl": 0.010794076602905989, "learning_rate": 6.711306596329888e-07, "loss": 0.00010706786997616291, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 495.75, "completions/min_length": 389.0, "epoch": 9.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.017504066228866577, "kl": 0.008023263304494321, "learning_rate": 6.710100716628344e-07, "loss": 8.009567682165653e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 437.5, "completions/min_length": 386.0, "epoch": 9.001470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.01072354894131422, "kl": 0.008518525282852352, "learning_rate": 6.708894724269691e-07, "loss": 8.507372695021331e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 435.4375, "completions/min_length": 364.0, "epoch": 9.00294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.00961537566035986, "kl": 0.008291428675875068, "learning_rate": 6.70768861933338e-07, "loss": 8.315710874740034e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 430.875, "completions/min_length": 379.0, "epoch": 9.004411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013372869230806828, "kl": 0.008404233492910862, "learning_rate": 6.706482401898863e-07, "loss": 8.361255459021777e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 522.125, "completions/min_length": 471.0, "epoch": 9.005882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9296517968177795, "kl": 0.009239579900167882, "learning_rate": 6.705276072045604e-07, "loss": 9.229034185409546e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 434.125, "completions/min_length": 345.0, "epoch": 9.007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011597228236496449, "kl": 0.007114503649063408, "learning_rate": 6.704069629853075e-07, "loss": 7.126957643777132e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 541.9375, "completions/min_length": 455.0, "epoch": 9.008823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.1915292739868164, "kl": 0.007944689248688519, "learning_rate": 6.702863075400749e-07, "loss": 8.016824722290039e-05, "reward": 0.8398333787918091, "reward_std": 0.2692755460739136, "rewards/DrugCombAccuracyCOTORM/mean": 0.8128125071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3379742205142975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1912434846162796, "step": 6126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 434.8125, "completions/min_length": 395.0, "epoch": 9.010294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.011773058213293552, "kl": 0.00886776193510741, "learning_rate": 6.701656408768115e-07, "loss": 8.924279973143712e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 494.4375, "completions/min_length": 445.0, "epoch": 9.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8102232217788696, "kl": 0.011683404678478837, "learning_rate": 6.700449630034662e-07, "loss": 0.000117044648504816, "reward": 0.625, "reward_std": 0.15811388194561005, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 450.625, "completions/min_length": 370.0, "epoch": 9.013235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8151599764823914, "kl": 0.011330039473250508, "learning_rate": 6.699242739279889e-07, "loss": 0.00011279321915935725, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 467.9375, "completions/min_length": 428.0, "epoch": 9.014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.011311903595924377, "kl": 0.008749251253902912, "learning_rate": 6.698035736583306e-07, "loss": 8.753615111345425e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 507.5625, "completions/min_length": 369.0, "epoch": 9.016176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9154163599014282, "kl": 0.01113781426101923, "learning_rate": 6.696828622024428e-07, "loss": 0.00011230669042561203, "reward": 0.9825520515441895, "reward_std": 0.04935016110539436, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 6131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 501.9375, "completions/min_length": 364.0, "epoch": 9.01764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.028043508529663, "kl": 0.014545805752277374, "learning_rate": 6.695621395682772e-07, "loss": 0.0001461956271668896, "reward": 0.8951666355133057, "reward_std": 0.13103264570236206, "rewards/DrugCombAccuracyCOTORM/mean": 0.8741666674613953, "rewards/DrugCombAccuracyCOTORM/std": 0.25863316655158997, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 6132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 516.75, "completions/min_length": 451.0, "epoch": 9.019117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.5010480880737305, "kl": 0.012327692238613963, "learning_rate": 6.694414057637869e-07, "loss": 0.00012287870049476624, "reward": 0.5738294124603271, "reward_std": 0.366669237613678, "rewards/DrugCombAccuracyCOTORM/mean": 0.5037450790405273, "rewards/DrugCombAccuracyCOTORM/std": 0.42732149362564087, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4962824881076813, "step": 6133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 459.5, "completions/min_length": 403.0, "epoch": 9.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8559799194335938, "kl": 0.007797719212248921, "learning_rate": 6.693206607969257e-07, "loss": 7.789582014083862e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 494.5625, "completions/min_length": 420.0, "epoch": 9.022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.009953737258911133, "kl": 0.008785593905486166, "learning_rate": 6.69199904675648e-07, "loss": 8.783441444393247e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 448.3125, "completions/min_length": 368.0, "epoch": 9.023529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009899190627038479, "kl": 0.007634238689206541, "learning_rate": 6.690791374079085e-07, "loss": 7.575024937978014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 459.125, "completions/min_length": 397.0, "epoch": 9.025, "frac_reward_zero_std": 0.5, "grad_norm": 1.2425024509429932, "kl": 0.01185198244638741, "learning_rate": 6.689583590016636e-07, "loss": 0.00011886656284332275, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 526.3125, "completions/min_length": 406.0, "epoch": 9.026470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.2366671562194824, "kl": 0.011181417154148221, "learning_rate": 6.688375694648693e-07, "loss": 0.00011470812023617327, "reward": 0.9114583730697632, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 6138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 451.5625, "completions/min_length": 357.0, "epoch": 9.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013150324113667011, "kl": 0.011229091556742787, "learning_rate": 6.687167688054833e-07, "loss": 0.00011227537470404059, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 456.6875, "completions/min_length": 371.0, "epoch": 9.029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066214995458722115, "kl": 0.005648992373608053, "learning_rate": 6.685959570314637e-07, "loss": 5.630151281366125e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 470.875, "completions/min_length": 410.0, "epoch": 9.030882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011315837502479553, "kl": 0.007901097065769136, "learning_rate": 6.684751341507688e-07, "loss": 7.896393071860075e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 432.8125, "completions/min_length": 381.0, "epoch": 9.032352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011073221452534199, "kl": 0.009609001921489835, "learning_rate": 6.683543001713587e-07, "loss": 9.596390736987814e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 503.75, "completions/min_length": 452.0, "epoch": 9.033823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8582786321640015, "kl": 0.009324336424469948, "learning_rate": 6.682334551011934e-07, "loss": 9.297996439272538e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 423.125, "completions/min_length": 363.0, "epoch": 9.035294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009628756903111935, "kl": 0.007677227375097573, "learning_rate": 6.681125989482337e-07, "loss": 7.702183211222291e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 442.375, "completions/min_length": 389.0, "epoch": 9.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9229041337966919, "kl": 0.010559633723460138, "learning_rate": 6.679917317204413e-07, "loss": 0.00010543316602706909, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 431.875, "completions/min_length": 369.0, "epoch": 9.038235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0984342098236084, "kl": 0.010429518064484, "learning_rate": 6.678708534257788e-07, "loss": 0.00010443106293678284, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 497.0, "completions/min_length": 391.0, "epoch": 9.03970588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.284260630607605, "kl": 0.010663488879799843, "learning_rate": 6.677499640722094e-07, "loss": 0.00010676681995391846, "reward": 0.7014047503471375, "reward_std": 0.3299766480922699, "rewards/DrugCombAccuracyCOTORM/mean": 0.6513094902038574, "rewards/DrugCombAccuracyCOTORM/std": 0.43200430274009705, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8035714626312256, "rewards/DrugCombCoverageCOTORM/std": 0.29450756311416626, "step": 6147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 446.4375, "completions/min_length": 397.0, "epoch": 9.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8670542240142822, "kl": 0.007729011704213917, "learning_rate": 6.676290636676968e-07, "loss": 7.671117782592773e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 443.25, "completions/min_length": 394.0, "epoch": 9.04264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5589630603790283, "kl": 0.012323241913691163, "learning_rate": 6.675081522202058e-07, "loss": 0.0001224130392074585, "reward": 0.5250000357627869, "reward_std": 0.2558746933937073, "rewards/DrugCombAccuracyCOTORM/mean": 0.40625, "rewards/DrugCombAccuracyCOTORM/std": 0.375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 480.5, "completions/min_length": 395.0, "epoch": 9.044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.027740193530917168, "kl": 0.009335680166259408, "learning_rate": 6.673872297377016e-07, "loss": 9.224153473041952e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 488.1875, "completions/min_length": 412.0, "epoch": 9.045588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.022321302443742752, "kl": 0.013559135608375072, "learning_rate": 6.672662962281503e-07, "loss": 0.00013579294318333268, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 6151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 455.5, "completions/min_length": 429.0, "epoch": 9.047058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013357484713196754, "kl": 0.009054038091562688, "learning_rate": 6.671453516995187e-07, "loss": 8.997347322292626e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 414.625, "completions/min_length": 326.0, "epoch": 9.048529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.301156997680664, "kl": 0.00885453075170517, "learning_rate": 6.670243961597744e-07, "loss": 8.70927469804883e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 523.5625, "completions/min_length": 440.0, "epoch": 9.05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2395234107971191, "kl": 0.009616765659302473, "learning_rate": 6.669034296168854e-07, "loss": 9.616464376449585e-05, "reward": 0.4089166820049286, "reward_std": 0.2538363039493561, "rewards/DrugCombAccuracyCOTORM/mean": 0.29500001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.30615901947021484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.57373046875, "step": 6154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 472.1875, "completions/min_length": 390.0, "epoch": 9.051470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9910933375358582, "kl": 0.01009684253949672, "learning_rate": 6.66782452078821e-07, "loss": 0.00010117061174241826, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 485.0625, "completions/min_length": 414.0, "epoch": 9.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012289341539144516, "kl": 0.010543656535446644, "learning_rate": 6.666614635535507e-07, "loss": 0.0001050062564900145, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 437.25, "completions/min_length": 340.0, "epoch": 9.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01181651372462511, "kl": 0.008141133701428771, "learning_rate": 6.665404640490448e-07, "loss": 8.140822319546714e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 438.5, "completions/min_length": 361.0, "epoch": 9.055882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.007862734608352184, "kl": 0.006790754850953817, "learning_rate": 6.664194535732748e-07, "loss": 6.787986785639077e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 438.5, "completions/min_length": 381.0, "epoch": 9.05735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013494635000824928, "kl": 0.008004918578080833, "learning_rate": 6.662984321342122e-07, "loss": 8.044181595323607e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 432.25, "completions/min_length": 361.0, "epoch": 9.058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.00987248495221138, "kl": 0.00954595033545047, "learning_rate": 6.661773997398297e-07, "loss": 9.495080303167924e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 423.0625, "completions/min_length": 338.0, "epoch": 9.060294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.030920259654521942, "kl": 0.008732986636459827, "learning_rate": 6.66056356398101e-07, "loss": 8.66045884322375e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 477.0, "completions/min_length": 385.0, "epoch": 9.061764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.443419337272644, "kl": 0.011263038963079453, "learning_rate": 6.659353021169995e-07, "loss": 0.00011221319437026978, "reward": 0.5249999761581421, "reward_std": 0.4475547671318054, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 488.625, "completions/min_length": 418.0, "epoch": 9.063235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9189735054969788, "kl": 0.011837755562737584, "learning_rate": 6.658142369045001e-07, "loss": 0.0001171426847577095, "reward": 0.799541711807251, "reward_std": 0.21549713611602783, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4235701560974121, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2770128548145294, "step": 6163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 447.625, "completions/min_length": 367.0, "epoch": 9.064705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010804861783981323, "kl": 0.008804523735307157, "learning_rate": 6.656931607685786e-07, "loss": 8.817158959573135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 427.9375, "completions/min_length": 373.0, "epoch": 9.066176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02407025173306465, "kl": 0.009055833797901869, "learning_rate": 6.655720737172109e-07, "loss": 8.914554200600833e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 442.375, "completions/min_length": 391.0, "epoch": 9.06764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013628335669636726, "kl": 0.007078406284563243, "learning_rate": 6.65450975758374e-07, "loss": 7.03133555362001e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 474.875, "completions/min_length": 422.0, "epoch": 9.069117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.02139529213309288, "kl": 0.01001242222264409, "learning_rate": 6.653298669000457e-07, "loss": 9.92488203337416e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 451.8125, "completions/min_length": 395.0, "epoch": 9.070588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01503405999392271, "kl": 0.012578763416968286, "learning_rate": 6.652087471502039e-07, "loss": 0.00012646625691559166, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 476.4375, "completions/min_length": 427.0, "epoch": 9.072058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.3151254653930664, "kl": 0.01583325583487749, "learning_rate": 6.65087616516828e-07, "loss": 0.00015693902969360352, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 464.0, "completions/min_length": 417.0, "epoch": 9.073529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.4272524118423462, "kl": 0.014317211462184787, "learning_rate": 6.649664750078979e-07, "loss": 0.0001446455717086792, "reward": 0.3125, "reward_std": 0.34973087906837463, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 460.375, "completions/min_length": 398.0, "epoch": 9.075, "frac_reward_zero_std": 1.0, "grad_norm": 0.054797958582639694, "kl": 0.011182006797753274, "learning_rate": 6.648453226313936e-07, "loss": 0.00011135460954392329, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 439.375, "completions/min_length": 389.0, "epoch": 9.076470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012753987684845924, "kl": 0.008343656314536929, "learning_rate": 6.647241593952968e-07, "loss": 8.347664697794244e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 434.75, "completions/min_length": 351.0, "epoch": 9.077941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.04128018021583557, "kl": 0.009153989842161536, "learning_rate": 6.646029853075893e-07, "loss": 9.221627260558307e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 469.375, "completions/min_length": 436.0, "epoch": 9.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9554725885391235, "kl": 0.010538726579397917, "learning_rate": 6.644818003762535e-07, "loss": 0.00010536798072280362, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 483.125, "completions/min_length": 390.0, "epoch": 9.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9460264444351196, "kl": 0.009414395317435265, "learning_rate": 6.643606046092731e-07, "loss": 9.53376293182373e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 503.4375, "completions/min_length": 411.0, "epoch": 9.08235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.344194769859314, "kl": 0.010498485295102, "learning_rate": 6.642393980146319e-07, "loss": 0.00010620057582855225, "reward": 0.7557916641235352, "reward_std": 0.18797163665294647, "rewards/DrugCombAccuracyCOTORM/mean": 0.7038542032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.3600369095802307, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.10485881567001343, "step": 6176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 448.1875, "completions/min_length": 351.0, "epoch": 9.083823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.3313989639282227, "kl": 0.013514337362721562, "learning_rate": 6.641181806003149e-07, "loss": 0.00013580918312072754, "reward": 0.7339166402816772, "reward_std": 0.4204002022743225, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 6177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 409.6875, "completions/min_length": 355.0, "epoch": 9.08529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012184458784759045, "kl": 0.009425018215551972, "learning_rate": 6.639969523743074e-07, "loss": 9.474616672378033e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 427.75, "completions/min_length": 347.0, "epoch": 9.086764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.012710530310869217, "kl": 0.007344188634306192, "learning_rate": 6.638757133445958e-07, "loss": 7.330443622777238e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 499.6875, "completions/min_length": 409.0, "epoch": 9.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3191453218460083, "kl": 0.013471200130879879, "learning_rate": 6.637544635191669e-07, "loss": 0.00013427867088466883, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/mean_length": 510.75, "completions/min_length": 363.0, "epoch": 9.089705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3451703786849976, "kl": 0.011332485359162092, "learning_rate": 6.636332029060083e-07, "loss": 0.00011324509978294373, "reward": 0.7593749761581421, "reward_std": 0.3320940434932709, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.7934191226959229, "step": 6181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 451.8125, "completions/min_length": 367.0, "epoch": 9.091176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01041371375322342, "kl": 0.007601267192512751, "learning_rate": 6.635119315131084e-07, "loss": 7.55093788029626e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 528.25, "completions/min_length": 441.0, "epoch": 9.092647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9057222604751587, "kl": 0.010102225933223963, "learning_rate": 6.633906493484563e-07, "loss": 0.00010085964458994567, "reward": 0.2750000059604645, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.25, "rewards/DrugCombCoverageCOTORM/std": 0.8563488721847534, "step": 6183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 417.3125, "completions/min_length": 342.0, "epoch": 9.094117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019685450941324234, "kl": 0.008185176295228302, "learning_rate": 6.632693564200416e-07, "loss": 8.08221593615599e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 428.125, "completions/min_length": 373.0, "epoch": 9.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01956172101199627, "kl": 0.008561447612009943, "learning_rate": 6.631480527358551e-07, "loss": 8.594141399953514e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 447.8125, "completions/min_length": 374.0, "epoch": 9.097058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009443876333534718, "kl": 0.009069355786778033, "learning_rate": 6.630267383038875e-07, "loss": 9.103964839596301e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 429.4375, "completions/min_length": 362.0, "epoch": 9.098529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01574486866593361, "kl": 0.007978372159413993, "learning_rate": 6.629054131321311e-07, "loss": 7.931794971227646e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 426.9375, "completions/min_length": 346.0, "epoch": 9.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.015491364523768425, "kl": 0.013654233189299703, "learning_rate": 6.627840772285783e-07, "loss": 0.00013551130541600287, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 476.0, "completions/min_length": 410.0, "epoch": 9.101470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6722204685211182, "kl": 0.00973102985881269, "learning_rate": 6.626627306012224e-07, "loss": 9.74312424659729e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 453.8125, "completions/min_length": 401.0, "epoch": 9.102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9250733256340027, "kl": 0.012881870614364743, "learning_rate": 6.625413732580575e-07, "loss": 0.0001288552739424631, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 442.625, "completions/min_length": 375.0, "epoch": 9.104411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012937053106725216, "kl": 0.006958038546144962, "learning_rate": 6.624200052070785e-07, "loss": 6.991974078118801e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.5, "completions/min_length": 366.0, "epoch": 9.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.063475251197815, "kl": 0.013575433054938912, "learning_rate": 6.622986264562803e-07, "loss": 0.000135861337184906, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 429.5625, "completions/min_length": 330.0, "epoch": 9.10735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0683871507644653, "kl": 0.012926029274240136, "learning_rate": 6.621772370136595e-07, "loss": 0.00013295179815031588, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 488.9375, "completions/min_length": 383.0, "epoch": 9.108823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.9144340753555298, "kl": 0.01304155052639544, "learning_rate": 6.620558368872126e-07, "loss": 0.00013164430856704712, "reward": 0.4345521330833435, "reward_std": 0.1278810203075409, "rewards/DrugCombAccuracyCOTORM/mean": 0.335833340883255, "rewards/DrugCombAccuracyCOTORM/std": 0.21184027194976807, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6588541865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2226829081773758, "step": 6194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 459.0, "completions/min_length": 396.0, "epoch": 9.110294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.834311306476593, "kl": 0.010232226457446814, "learning_rate": 6.619344260849373e-07, "loss": 0.00010228750034002587, "reward": 0.5859999656677246, "reward_std": 0.04615088924765587, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.49441322684288025, "step": 6195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 431.4375, "completions/min_length": 387.0, "epoch": 9.111764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010874420404434204, "kl": 0.008857688284479082, "learning_rate": 6.618130046148318e-07, "loss": 8.823248208500445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 431.6875, "completions/min_length": 340.0, "epoch": 9.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.022490665316581726, "kl": 0.009135852567851543, "learning_rate": 6.616915724848951e-07, "loss": 9.20331513043493e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 432.1875, "completions/min_length": 381.0, "epoch": 9.114705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3724614381790161, "kl": 0.015007707756012678, "learning_rate": 6.615701297031268e-07, "loss": 0.00015038251876831055, "reward": 0.699999988079071, "reward_std": 0.3484410047531128, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 445.4375, "completions/min_length": 373.0, "epoch": 9.116176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.8638330101966858, "kl": 0.012212595669552684, "learning_rate": 6.614486762775273e-07, "loss": 0.00012186843378003687, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 426.3125, "completions/min_length": 365.0, "epoch": 9.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8365640044212341, "kl": 0.010750011075288057, "learning_rate": 6.613272122160974e-07, "loss": 0.00010803164332173765, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 504.25, "completions/min_length": 438.0, "epoch": 9.119117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9857622981071472, "kl": 0.009697504108771682, "learning_rate": 6.612057375268392e-07, "loss": 9.743869304656982e-05, "reward": 0.65625, "reward_std": 0.12302003800868988, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4552929699420929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 495.125, "completions/min_length": 411.0, "epoch": 9.120588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01739431917667389, "kl": 0.009432606748305261, "learning_rate": 6.610842522177549e-07, "loss": 9.508905350230634e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 488.6875, "completions/min_length": 421.0, "epoch": 9.12205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0070991516113281, "kl": 0.012084246845915914, "learning_rate": 6.609627562968479e-07, "loss": 0.00012194352166261524, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 390.0, "completions/min_length": 325.0, "epoch": 9.123529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.019885890185832977, "kl": 0.00919620378408581, "learning_rate": 6.608412497721215e-07, "loss": 9.229192801285535e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 446.75, "completions/min_length": 369.0, "epoch": 9.125, "frac_reward_zero_std": 0.5, "grad_norm": 1.0462901592254639, "kl": 0.00991607690230012, "learning_rate": 6.607197326515807e-07, "loss": 9.98377799987793e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/mean_length": 527.375, "completions/min_length": 423.0, "epoch": 9.126470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.3208035230636597, "kl": 0.011472065118141472, "learning_rate": 6.605982049432308e-07, "loss": 0.00010980665683746338, "reward": 0.529619038105011, "reward_std": 0.346235990524292, "rewards/DrugCombAccuracyCOTORM/mean": 0.46291667222976685, "rewards/DrugCombAccuracyCOTORM/std": 0.48721563816070557, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5928571224212646, "rewards/DrugCombCoverageCOTORM/std": 0.7949842810630798, "step": 6206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 470.25, "completions/min_length": 400.0, "epoch": 9.12794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0167116466909647, "kl": 0.009208159055560827, "learning_rate": 6.604766666550775e-07, "loss": 9.167865209747106e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 448.75, "completions/min_length": 392.0, "epoch": 9.129411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01526582520455122, "kl": 0.011155243963003159, "learning_rate": 6.603551177951275e-07, "loss": 0.00011117082613054663, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 438.9375, "completions/min_length": 391.0, "epoch": 9.130882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.014956226572394371, "kl": 0.009597458294592798, "learning_rate": 6.602335583713883e-07, "loss": 9.589356341166422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 436.1875, "completions/min_length": 408.0, "epoch": 9.132352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1974976062774658, "kl": 0.010036020539700985, "learning_rate": 6.601119883918676e-07, "loss": 0.00010047317482531071, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 439.75, "completions/min_length": 407.0, "epoch": 9.133823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.009242361411452293, "kl": 0.008388703339733183, "learning_rate": 6.599904078645744e-07, "loss": 8.415785850957036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 469.1875, "completions/min_length": 406.0, "epoch": 9.135294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.025688141584396362, "kl": 0.009490007301792502, "learning_rate": 6.598688167975179e-07, "loss": 9.489145304542035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 403.75, "completions/min_length": 374.0, "epoch": 9.136764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010818003676831722, "kl": 0.0069646339397877455, "learning_rate": 6.597472151987085e-07, "loss": 6.977484736125916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 461.6875, "completions/min_length": 423.0, "epoch": 9.138235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.08880790323019028, "kl": 0.012738365214318037, "learning_rate": 6.596256030761566e-07, "loss": 0.00012655757018364966, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 425.875, "completions/min_length": 361.0, "epoch": 9.139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01121458038687706, "kl": 0.007962346659041941, "learning_rate": 6.595039804378741e-07, "loss": 7.91559141362086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 428.125, "completions/min_length": 373.0, "epoch": 9.141176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1133517026901245, "kl": 0.010496828239411116, "learning_rate": 6.593823472918731e-07, "loss": 0.00010482206562301144, "reward": 0.6213333010673523, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 6216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 513.25, "completions/min_length": 458.0, "epoch": 9.14264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013806400820612907, "kl": 0.008584498194977641, "learning_rate": 6.592607036461662e-07, "loss": 8.533993968740106e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 417.0, "completions/min_length": 359.0, "epoch": 9.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.032368600368499756, "kl": 0.010227207327261567, "learning_rate": 6.591390495087673e-07, "loss": 0.00010210822074441239, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 437.875, "completions/min_length": 388.0, "epoch": 9.145588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009498275816440582, "kl": 0.0062843800988048315, "learning_rate": 6.590173848876905e-07, "loss": 6.280025991145521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 415.0625, "completions/min_length": 369.0, "epoch": 9.147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.03022720292210579, "kl": 0.010482188314199448, "learning_rate": 6.588957097909507e-07, "loss": 0.00010436483717057854, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 441.125, "completions/min_length": 401.0, "epoch": 9.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7403293251991272, "kl": 0.008536106091924012, "learning_rate": 6.58774024226564e-07, "loss": 8.545204036636278e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 450.8125, "completions/min_length": 406.0, "epoch": 9.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.01784064620733261, "kl": 0.008939597988501191, "learning_rate": 6.586523282025461e-07, "loss": 8.986805187305436e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 448.1875, "completions/min_length": 407.0, "epoch": 9.151470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9976919293403625, "kl": 0.008402184001170099, "learning_rate": 6.585306217269144e-07, "loss": 8.330866694450378e-05, "reward": 0.9708333015441895, "reward_std": 0.05473601073026657, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 456.375, "completions/min_length": 390.0, "epoch": 9.152941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0899934768676758, "kl": 0.018053694046102464, "learning_rate": 6.584089048076865e-07, "loss": 0.00019336114928591996, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 505.75, "completions/min_length": 425.0, "epoch": 9.154411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.215275526046753, "kl": 0.012121196370571852, "learning_rate": 6.582871774528809e-07, "loss": 0.00012193889415357262, "reward": 0.831250011920929, "reward_std": 0.19944117963314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.35939764976501465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 6225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 483.8125, "completions/min_length": 440.0, "epoch": 9.155882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.030321478843689, "kl": 0.009363978053443134, "learning_rate": 6.581654396705166e-07, "loss": 9.427964687347412e-05, "reward": 0.528499960899353, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.45750001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.4373328685760498, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 482.5625, "completions/min_length": 358.0, "epoch": 9.157352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9442510008811951, "kl": 0.009710682090371847, "learning_rate": 6.580436914686133e-07, "loss": 9.62281774263829e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 429.625, "completions/min_length": 404.0, "epoch": 9.158823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.013291756622493267, "kl": 0.010562870185822248, "learning_rate": 6.579219328551917e-07, "loss": 0.00010566077980911359, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 458.625, "completions/min_length": 416.0, "epoch": 9.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.031504642218351364, "kl": 0.009033913258463144, "learning_rate": 6.578001638382728e-07, "loss": 8.918395906221122e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 473.5625, "completions/min_length": 397.0, "epoch": 9.161764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.0163142681121826, "kl": 0.009728195844218135, "learning_rate": 6.576783844258783e-07, "loss": 9.706616401672363e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 435.1875, "completions/min_length": 363.0, "epoch": 9.163235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.008971920236945152, "kl": 0.006637980113737285, "learning_rate": 6.57556594626031e-07, "loss": 6.616151222260669e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 462.875, "completions/min_length": 386.0, "epoch": 9.16470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.024403298273682594, "kl": 0.01087455777451396, "learning_rate": 6.57434794446754e-07, "loss": 0.00010854328866116703, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 444.1875, "completions/min_length": 357.0, "epoch": 9.166176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9080299139022827, "kl": 0.010577139910310507, "learning_rate": 6.573129838960712e-07, "loss": 0.00010679662227630615, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 445.9375, "completions/min_length": 382.0, "epoch": 9.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02176915481686592, "kl": 0.009601383469998837, "learning_rate": 6.57191162982007e-07, "loss": 9.626885002944618e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 496.8125, "completions/min_length": 399.0, "epoch": 9.169117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8664158582687378, "kl": 0.008088146802037954, "learning_rate": 6.570693317125867e-07, "loss": 7.99819827079773e-05, "reward": 0.7420833110809326, "reward_std": 0.15859439969062805, "rewards/DrugCombAccuracyCOTORM/mean": 0.706250011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.40078049898147583, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3154948949813843, "step": 6235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 492.75, "completions/min_length": 380.0, "epoch": 9.170588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0343810319900513, "kl": 0.008159477729350328, "learning_rate": 6.569474900958365e-07, "loss": 8.189026266336441e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 456.6875, "completions/min_length": 403.0, "epoch": 9.172058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0876539945602417, "kl": 0.013539029052481055, "learning_rate": 6.568256381397827e-07, "loss": 0.00013519078493118286, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 475.3125, "completions/min_length": 383.0, "epoch": 9.173529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0492393970489502, "kl": 0.01033644424751401, "learning_rate": 6.567037758524528e-07, "loss": 0.00010295565880369395, "reward": 0.9104166626930237, "reward_std": 0.09705483913421631, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.1912434846162796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 6238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 450.0625, "completions/min_length": 412.0, "epoch": 9.175, "frac_reward_zero_std": 0.5, "grad_norm": 0.8729075789451599, "kl": 0.009485023678280413, "learning_rate": 6.565819032418747e-07, "loss": 9.481608867645264e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 6239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 481.75, "completions/min_length": 404.0, "epoch": 9.176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.04072976112365723, "kl": 0.011620173696428537, "learning_rate": 6.564600203160771e-07, "loss": 0.00011790009739343077, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 512.1875, "completions/min_length": 421.0, "epoch": 9.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7527668476104736, "kl": 0.007952423999086022, "learning_rate": 6.563381270830894e-07, "loss": 7.983297109603882e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 420.0625, "completions/min_length": 370.0, "epoch": 9.179411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1408987045288086, "kl": 0.013760727131739259, "learning_rate": 6.562162235509414e-07, "loss": 0.00013513490557670593, "reward": 0.4312500059604645, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 461.4375, "completions/min_length": 389.0, "epoch": 9.180882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.866032063961029, "kl": 0.009918246301822364, "learning_rate": 6.56094309727664e-07, "loss": 9.892880916595459e-05, "reward": 0.518750011920929, "reward_std": 0.025877458974719048, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.9105859398841858, "step": 6243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 493.875, "completions/min_length": 385.0, "epoch": 9.18235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9457511305809021, "kl": 0.010819529998116195, "learning_rate": 6.559723856212884e-07, "loss": 0.00010801109601743519, "reward": 0.8806250095367432, "reward_std": 0.14885523915290833, "rewards/DrugCombAccuracyCOTORM/mean": 0.862500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.2671869993209839, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 6244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 454.6875, "completions/min_length": 360.0, "epoch": 9.183823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01683979295194149, "kl": 0.01012604555580765, "learning_rate": 6.558504512398468e-07, "loss": 0.00010147903230972588, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 482.0, "completions/min_length": 390.0, "epoch": 9.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8192130923271179, "kl": 0.008061834261752665, "learning_rate": 6.55728506591372e-07, "loss": 8.093193173408508e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 6246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 473.4375, "completions/min_length": 402.0, "epoch": 9.186764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0043590068817139, "kl": 0.014674989040941, "learning_rate": 6.556065516838971e-07, "loss": 0.0001444434601580724, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 434.75, "completions/min_length": 375.0, "epoch": 9.188235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.011918006464838982, "kl": 0.008348078583367169, "learning_rate": 6.554845865254565e-07, "loss": 8.331186108989641e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 425.875, "completions/min_length": 371.0, "epoch": 9.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03479628637433052, "kl": 0.009208033443428576, "learning_rate": 6.553626111240848e-07, "loss": 9.167238022200763e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 537.9375, "completions/min_length": 455.0, "epoch": 9.191176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1938430070877075, "kl": 0.010563439573161304, "learning_rate": 6.552406254878175e-07, "loss": 0.00010562688112258911, "reward": 0.8541666269302368, "reward_std": 0.2003469169139862, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 6250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 436.375, "completions/min_length": 390.0, "epoch": 9.19264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011506162583827972, "kl": 0.0077974689193069935, "learning_rate": 6.551186296246905e-07, "loss": 7.825227658031508e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 428.5, "completions/min_length": 387.0, "epoch": 9.194117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.009595941752195358, "kl": 0.007584896753542125, "learning_rate": 6.549966235427409e-07, "loss": 7.579791417811066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 460.9375, "completions/min_length": 398.0, "epoch": 9.195588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.032127857208252, "kl": 0.011742521310225129, "learning_rate": 6.548746072500059e-07, "loss": 0.00011581182479858398, "reward": 0.7080000042915344, "reward_std": 0.3673255443572998, "rewards/DrugCombAccuracyCOTORM/mean": 0.6662499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4477108418941498, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 491.0625, "completions/min_length": 435.0, "epoch": 9.197058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4706181287765503, "kl": 0.011489694705232978, "learning_rate": 6.547525807545237e-07, "loss": 0.00011483579874038696, "reward": 0.5867499709129333, "reward_std": 0.18259836733341217, "rewards/DrugCombAccuracyCOTORM/mean": 0.5016666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4610423147678375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 6254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 464.125, "completions/min_length": 423.0, "epoch": 9.198529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7871561646461487, "kl": 0.011746216681785882, "learning_rate": 6.546305440643332e-07, "loss": 0.00011828655260615051, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 458.0625, "completions/min_length": 401.0, "epoch": 9.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.009304994717240334, "kl": 0.008297884021885693, "learning_rate": 6.545084971874736e-07, "loss": 8.252948464360088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 421.6875, "completions/min_length": 362.0, "epoch": 9.201470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008665728382766247, "kl": 0.006646035122685134, "learning_rate": 6.543864401319854e-07, "loss": 6.679767102468759e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 450.3125, "completions/min_length": 291.0, "epoch": 9.202941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.5646260976791382, "kl": 0.007028421969152987, "learning_rate": 6.542643729059092e-07, "loss": 6.893619138281792e-05, "reward": 0.996874988079071, "reward_std": 0.008838826790452003, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 439.25, "completions/min_length": 400.0, "epoch": 9.204411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.008339492604136467, "kl": 0.008481891709379852, "learning_rate": 6.541422955172864e-07, "loss": 8.423048711847514e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 418.5625, "completions/min_length": 392.0, "epoch": 9.205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013343029655516148, "kl": 0.009309232700616121, "learning_rate": 6.540202079741594e-07, "loss": 9.295708150602877e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 441.875, "completions/min_length": 390.0, "epoch": 9.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006968916393816471, "kl": 0.0054060862166807055, "learning_rate": 6.538981102845709e-07, "loss": 5.396000051405281e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 431.25, "completions/min_length": 378.0, "epoch": 9.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03330463171005249, "kl": 0.009274318697862327, "learning_rate": 6.537760024565642e-07, "loss": 9.336236689705402e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/mean_length": 569.9375, "completions/min_length": 507.0, "epoch": 9.21029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8406659960746765, "kl": 0.010629083262756467, "learning_rate": 6.536538844981836e-07, "loss": 0.0001061290895449929, "reward": 0.7000000476837158, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 421.5625, "completions/min_length": 380.0, "epoch": 9.211764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.013550621457397938, "kl": 0.008215227629989386, "learning_rate": 6.53531756417474e-07, "loss": 8.203543984564021e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 435.625, "completions/min_length": 397.0, "epoch": 9.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.09933236241340637, "kl": 0.012748236767947674, "learning_rate": 6.534096182224808e-07, "loss": 0.0001287369232159108, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 456.5625, "completions/min_length": 388.0, "epoch": 9.214705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.014224458485841751, "kl": 0.008938082261011004, "learning_rate": 6.532874699212503e-07, "loss": 8.880545647116378e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 462.4375, "completions/min_length": 384.0, "epoch": 9.216176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.013253103010356426, "kl": 0.008738846867345273, "learning_rate": 6.531653115218292e-07, "loss": 8.783850353211164e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 450.8125, "completions/min_length": 356.0, "epoch": 9.217647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.08067326992750168, "kl": 0.01071799488272518, "learning_rate": 6.530431430322649e-07, "loss": 0.00010622791160130873, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 424.3125, "completions/min_length": 385.0, "epoch": 9.219117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013353222981095314, "kl": 0.009730248479172587, "learning_rate": 6.529209644606057e-07, "loss": 9.709228470455855e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 449.25, "completions/min_length": 415.0, "epoch": 9.220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02338942140340805, "kl": 0.010100544895976782, "learning_rate": 6.527987758149002e-07, "loss": 0.00010038772597908974, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 454.25, "completions/min_length": 425.0, "epoch": 9.222058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02377774380147457, "kl": 0.010500916978344321, "learning_rate": 6.526765771031982e-07, "loss": 0.00010511405707802624, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 438.125, "completions/min_length": 339.0, "epoch": 9.223529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7558261156082153, "kl": 0.009866302134469151, "learning_rate": 6.525543683335496e-07, "loss": 9.821738058235496e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 451.0625, "completions/min_length": 404.0, "epoch": 9.225, "frac_reward_zero_std": 0.0, "grad_norm": 1.528731107711792, "kl": 0.016151691903360188, "learning_rate": 6.524321495140054e-07, "loss": 0.00015968084335327148, "reward": 0.7796875238418579, "reward_std": 0.3765065670013428, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 458.375, "completions/min_length": 368.0, "epoch": 9.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017718752846121788, "kl": 0.01099293609149754, "learning_rate": 6.523099206526168e-07, "loss": 0.00010893019498325884, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 487.875, "completions/min_length": 427.0, "epoch": 9.227941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.010907516814768314, "kl": 0.009367730701342225, "learning_rate": 6.521876817574362e-07, "loss": 9.340145334135741e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 490.0625, "completions/min_length": 426.0, "epoch": 9.229411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3412014245986938, "kl": 0.015755717176944017, "learning_rate": 6.520654328365161e-07, "loss": 0.0001574307680130005, "reward": 0.6035000085830688, "reward_std": 0.3591628074645996, "rewards/DrugCombAccuracyCOTORM/mean": 0.5199999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.4445222318172455, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 6276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/mean_length": 482.6875, "completions/min_length": 326.0, "epoch": 9.230882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0635812282562256, "kl": 0.014171891147270799, "learning_rate": 6.519431738979102e-07, "loss": 0.00014221668243408203, "reward": 0.8950647115707397, "reward_std": 0.07491926103830338, "rewards/DrugCombAccuracyCOTORM/mean": 0.8707840442657471, "rewards/DrugCombAccuracyCOTORM/std": 0.18507762253284454, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.03359273448586464, "step": 6277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 495.125, "completions/min_length": 427.0, "epoch": 9.23235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3055140972137451, "kl": 0.010970079572871327, "learning_rate": 6.518209049496726e-07, "loss": 0.00010849535465240479, "reward": 0.5375000238418579, "reward_std": 0.34973087906837463, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 481.6875, "completions/min_length": 404.0, "epoch": 9.233823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9534355998039246, "kl": 0.01521438849158585, "learning_rate": 6.516986259998581e-07, "loss": 0.00015413451183121651, "reward": 0.6301249861717224, "reward_std": 0.1829991638660431, "rewards/DrugCombAccuracyCOTORM/mean": 0.6040624976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4788152873516083, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 6279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 503.75, "completions/min_length": 460.0, "epoch": 9.235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.3413234949111938, "kl": 0.012228923616930842, "learning_rate": 6.515763370565217e-07, "loss": 0.0001224726438522339, "reward": 0.7102500200271606, "reward_std": 0.3807827830314636, "rewards/DrugCombAccuracyCOTORM/mean": 0.6456249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.47547829151153564, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 6280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 389.9375, "completions/min_length": 370.0, "epoch": 9.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.041618261486291885, "kl": 0.008050492382608354, "learning_rate": 6.5145403812772e-07, "loss": 7.968907448230311e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 511.375, "completions/min_length": 402.0, "epoch": 9.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1384352445602417, "kl": 0.007738804328255355, "learning_rate": 6.513317292215095e-07, "loss": 7.774990808684379e-05, "reward": 0.9444681406021118, "reward_std": 0.07894683629274368, "rewards/DrugCombAccuracyCOTORM/mean": 0.9352726936340332, "rewards/DrugCombAccuracyCOTORM/std": 0.14486409723758698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.10246951878070831, "step": 6282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 461.75, "completions/min_length": 398.0, "epoch": 9.239705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.02771097421646118, "kl": 0.011405907338485122, "learning_rate": 6.512094103459477e-07, "loss": 0.0001146771464846097, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 492.4375, "completions/min_length": 379.0, "epoch": 9.241176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9563789367675781, "kl": 0.01467172265984118, "learning_rate": 6.510870815090926e-07, "loss": 0.0001474093005526811, "reward": 0.640529990196228, "reward_std": 0.0514504499733448, "rewards/DrugCombAccuracyCOTORM/mean": 0.5615999698638916, "rewards/DrugCombAccuracyCOTORM/std": 0.46163037419319153, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10246950387954712, "step": 6284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 445.9375, "completions/min_length": 401.0, "epoch": 9.242647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012850762344896793, "kl": 0.010187507723458111, "learning_rate": 6.509647427190029e-07, "loss": 0.00010200295218965039, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 446.6875, "completions/min_length": 387.0, "epoch": 9.244117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9694263339042664, "kl": 0.014539926312863827, "learning_rate": 6.508423939837379e-07, "loss": 0.0001459617487853393, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 429.6875, "completions/min_length": 359.0, "epoch": 9.245588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.9120655059814453, "kl": 0.013029409106820822, "learning_rate": 6.507200353113578e-07, "loss": 0.00013027340173721313, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 6287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 428.4375, "completions/min_length": 378.0, "epoch": 9.24705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018604397773742676, "kl": 0.011088072787970304, "learning_rate": 6.505976667099233e-07, "loss": 0.00011115612142020836, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 472.375, "completions/min_length": 429.0, "epoch": 9.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9714455008506775, "kl": 0.009658310213126242, "learning_rate": 6.504752881874954e-07, "loss": 9.65222716331482e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 440.5625, "completions/min_length": 409.0, "epoch": 9.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.9971301555633545, "kl": 0.011401823721826077, "learning_rate": 6.503528997521364e-07, "loss": 0.00011526554590091109, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 459.375, "completions/min_length": 385.0, "epoch": 9.251470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.020774612203240395, "kl": 0.015000044601038098, "learning_rate": 6.502305014119092e-07, "loss": 0.00014976586680859327, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 463.1875, "completions/min_length": 363.0, "epoch": 9.25294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0095593929290771, "kl": 0.00896727410145104, "learning_rate": 6.501080931748764e-07, "loss": 8.94241820788011e-05, "reward": 0.25, "reward_std": 0.2507132589817047, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 0.730296790599823, "step": 6292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/mean_length": 516.25, "completions/min_length": 399.0, "epoch": 9.254411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7485787272453308, "kl": 0.010280620655976236, "learning_rate": 6.499856750491023e-07, "loss": 0.0001021847128868103, "reward": 0.5485458374023438, "reward_std": 0.11131175607442856, "rewards/DrugCombAccuracyCOTORM/mean": 0.4376354217529297, "rewards/DrugCombAccuracyCOTORM/std": 0.48760515451431274, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 6293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 496.625, "completions/min_length": 434.0, "epoch": 9.255882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.486763834953308, "kl": 0.009979161899536848, "learning_rate": 6.498632470426515e-07, "loss": 9.929198131430894e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 439.25, "completions/min_length": 404.0, "epoch": 9.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.022547580301761627, "kl": 0.010221259202808142, "learning_rate": 6.497408091635893e-07, "loss": 0.00010256578389089555, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 448.375, "completions/min_length": 379.0, "epoch": 9.258823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.206336259841919, "kl": 0.0142946548294276, "learning_rate": 6.496183614199816e-07, "loss": 0.00014253088738769293, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 471.5625, "completions/min_length": 435.0, "epoch": 9.260294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8579838871955872, "kl": 0.010588787728920579, "learning_rate": 6.494959038198948e-07, "loss": 0.00010578334331512451, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 481.125, "completions/min_length": 378.0, "epoch": 9.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016556087881326675, "kl": 0.01061009126715362, "learning_rate": 6.493734363713963e-07, "loss": 0.00010581151582300663, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 485.4375, "completions/min_length": 412.0, "epoch": 9.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9920756220817566, "kl": 0.011638895375654101, "learning_rate": 6.492509590825537e-07, "loss": 0.00011601480946410447, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 420.75, "completions/min_length": 380.0, "epoch": 9.264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.009239875711500645, "kl": 0.00693139957729727, "learning_rate": 6.491284719614357e-07, "loss": 6.917495920788497e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 429.4375, "completions/min_length": 371.0, "epoch": 9.266176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0745713710784912, "kl": 0.010514345718547702, "learning_rate": 6.490059750161113e-07, "loss": 0.00010437340097269043, "reward": 0.843250036239624, "reward_std": 0.16757279634475708, "rewards/DrugCombAccuracyCOTORM/mean": 0.8118749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.3365282416343689, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.11180340498685837, "step": 6301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 461.4375, "completions/min_length": 407.0, "epoch": 9.26764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0726195573806763, "kl": 0.00890011410228908, "learning_rate": 6.488834682546503e-07, "loss": 8.896580402506515e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 428.0625, "completions/min_length": 373.0, "epoch": 9.269117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.016274068504571915, "kl": 0.007812180672772229, "learning_rate": 6.487609516851234e-07, "loss": 7.785535126458853e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 446.875, "completions/min_length": 363.0, "epoch": 9.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01580054499208927, "kl": 0.008964109933003783, "learning_rate": 6.486384253156013e-07, "loss": 8.991068898467347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 445.5625, "completions/min_length": 399.0, "epoch": 9.272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01873181201517582, "kl": 0.009536455618217587, "learning_rate": 6.48515889154156e-07, "loss": 9.484057227382436e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 491.6875, "completions/min_length": 459.0, "epoch": 9.273529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010835723951458931, "kl": 0.008069724892266095, "learning_rate": 6.483933432088596e-07, "loss": 8.068753231782466e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 420.4375, "completions/min_length": 339.0, "epoch": 9.275, "frac_reward_zero_std": 0.5, "grad_norm": 1.238010287284851, "kl": 0.011888181092217565, "learning_rate": 6.482707874877854e-07, "loss": 0.00011797412298619747, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 499.125, "completions/min_length": 427.0, "epoch": 9.276470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8481563925743103, "kl": 0.010725204017944634, "learning_rate": 6.48148221999007e-07, "loss": 0.00010702759027481079, "reward": 0.6495000123977661, "reward_std": 0.1534648984670639, "rewards/DrugCombAccuracyCOTORM/mean": 0.5879166722297668, "rewards/DrugCombAccuracyCOTORM/std": 0.49227645993232727, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3053029179573059, "step": 6308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 448.8125, "completions/min_length": 388.0, "epoch": 9.277941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0088571310043335, "kl": 0.01391076622530818, "learning_rate": 6.480256467505987e-07, "loss": 0.0001409575343132019, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 463.0, "completions/min_length": 362.0, "epoch": 9.279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8138737082481384, "kl": 0.007250996306538582, "learning_rate": 6.479030617506353e-07, "loss": 7.241964340209961e-05, "reward": 0.5553571581840515, "reward_std": 0.15657366812229156, "rewards/DrugCombAccuracyCOTORM/mean": 0.5535714030265808, "rewards/DrugCombAccuracyCOTORM/std": 0.5054128766059875, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 6310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 470.875, "completions/min_length": 421.0, "epoch": 9.280882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.021936058998108, "kl": 0.007761215325444937, "learning_rate": 6.477804670071923e-07, "loss": 7.755555270705372e-05, "reward": 0.8385416865348816, "reward_std": 0.15436361730098724, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.27216553688049316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 6311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 406.375, "completions/min_length": 333.0, "epoch": 9.282352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9856961965560913, "kl": 0.01066807471215725, "learning_rate": 6.476578625283464e-07, "loss": 0.00010682239371817559, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 449.3125, "completions/min_length": 407.0, "epoch": 9.283823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01801997236907482, "kl": 0.007582292892038822, "learning_rate": 6.475352483221741e-07, "loss": 7.567180728074163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 398.75, "completions/min_length": 364.0, "epoch": 9.285294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010766854509711266, "kl": 0.008398196077905595, "learning_rate": 6.47412624396753e-07, "loss": 8.416366472374648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/mean_length": 549.9375, "completions/min_length": 423.0, "epoch": 9.286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0898000001907349, "kl": 0.011839405400678515, "learning_rate": 6.472899907601612e-07, "loss": 0.00011875528434757143, "reward": 0.3757129907608032, "reward_std": 0.12944859266281128, "rewards/DrugCombAccuracyCOTORM/mean": 0.23830443620681763, "rewards/DrugCombAccuracyCOTORM/std": 0.3297206163406372, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8506944179534912, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 6315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 489.25, "completions/min_length": 462.0, "epoch": 9.288235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.8179657459259033, "kl": 0.00928128813393414, "learning_rate": 6.471673474204776e-07, "loss": 9.292678441852331e-05, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 6316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 440.125, "completions/min_length": 402.0, "epoch": 9.28970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0096873939037323, "kl": 0.007941175950691104, "learning_rate": 6.470446943857817e-07, "loss": 7.963881944306195e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 439.125, "completions/min_length": 379.0, "epoch": 9.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9459534883499146, "kl": 0.010246844962239265, "learning_rate": 6.469220316641532e-07, "loss": 0.00010231882333755493, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 410.6875, "completions/min_length": 373.0, "epoch": 9.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016371959820389748, "kl": 0.009619012707844377, "learning_rate": 6.467993592636732e-07, "loss": 9.634851448936388e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 473.375, "completions/min_length": 394.0, "epoch": 9.294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8556996583938599, "kl": 0.010208573658019304, "learning_rate": 6.46676677192423e-07, "loss": 0.0001015886664390564, "reward": 0.8656041622161865, "reward_std": 0.09123873710632324, "rewards/DrugCombAccuracyCOTORM/mean": 0.8671615123748779, "rewards/DrugCombAccuracyCOTORM/std": 0.17956797778606415, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.6741764545440674, "step": 6320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 454.125, "completions/min_length": 383.0, "epoch": 9.295588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.035090684890747, "kl": 0.012087741517461836, "learning_rate": 6.465539854584845e-07, "loss": 0.00011942462879233062, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 473.4375, "completions/min_length": 409.0, "epoch": 9.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9961434602737427, "kl": 0.011687124380841851, "learning_rate": 6.464312840699402e-07, "loss": 0.00011611002264544368, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 426.375, "completions/min_length": 365.0, "epoch": 9.298529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.051820509135723114, "kl": 0.007853286224417388, "learning_rate": 6.463085730348737e-07, "loss": 7.910155545687303e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 418.1875, "completions/min_length": 337.0, "epoch": 9.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.7642820477485657, "kl": 0.013104586629197001, "learning_rate": 6.461858523613684e-07, "loss": 0.0001318975118920207, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 463.75, "completions/min_length": 393.0, "epoch": 9.301470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.007233354728668928, "kl": 0.006459818920120597, "learning_rate": 6.460631220575091e-07, "loss": 6.476775888586417e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 438.25, "completions/min_length": 393.0, "epoch": 9.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.293027639389038, "kl": 0.010623424779623747, "learning_rate": 6.459403821313811e-07, "loss": 0.00010647624731063843, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 445.1875, "completions/min_length": 401.0, "epoch": 9.304411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0250669717788696, "kl": 0.010511086089536548, "learning_rate": 6.4581763259107e-07, "loss": 0.00010510534048080444, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 436.0625, "completions/min_length": 393.0, "epoch": 9.305882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011594587936997414, "kl": 0.008089193026535213, "learning_rate": 6.456948734446624e-07, "loss": 8.12297803349793e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 457.1875, "completions/min_length": 380.0, "epoch": 9.30735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014675645157694817, "kl": 0.010487939696758986, "learning_rate": 6.45572104700245e-07, "loss": 0.00010478555486770347, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 468.75, "completions/min_length": 396.0, "epoch": 9.308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8593102693557739, "kl": 0.009468718315474689, "learning_rate": 6.454493263659059e-07, "loss": 9.486079216003418e-05, "reward": 0.8520833253860474, "reward_std": 0.0652756616473198, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.17078250646591187, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 445.0625, "completions/min_length": 394.0, "epoch": 9.310294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.81020587682724, "kl": 0.007768108509480953, "learning_rate": 6.453265384497333e-07, "loss": 7.716172694927081e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 470.375, "completions/min_length": 428.0, "epoch": 9.311764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8936574459075928, "kl": 0.011803242727182806, "learning_rate": 6.45203740959816e-07, "loss": 0.00011696323053911328, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 495.5625, "completions/min_length": 449.0, "epoch": 9.313235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3378677368164062, "kl": 0.012587206903845072, "learning_rate": 6.450809339042439e-07, "loss": 0.0001258552074432373, "reward": 0.37316665053367615, "reward_std": 0.33433547616004944, "rewards/DrugCombAccuracyCOTORM/mean": 0.28416669368743896, "rewards/DrugCombAccuracyCOTORM/std": 0.39547720551490784, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5392038226127625, "step": 6333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 480.4375, "completions/min_length": 419.0, "epoch": 9.314705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.5732430219650269, "kl": 0.011848025023937225, "learning_rate": 6.449581172911069e-07, "loss": 0.00011865794658660889, "reward": 0.762499988079071, "reward_std": 0.4001959264278412, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 443.0, "completions/min_length": 376.0, "epoch": 9.316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.3587191104888916, "kl": 0.01281343586742878, "learning_rate": 6.448352911284959e-07, "loss": 0.00012900680303573608, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 408.0, "completions/min_length": 364.0, "epoch": 9.31764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013308394700288773, "kl": 0.008290371508337557, "learning_rate": 6.447124554245024e-07, "loss": 8.285044168587774e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 413.625, "completions/min_length": 319.0, "epoch": 9.319117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.041930507868528366, "kl": 0.011029050219804049, "learning_rate": 6.445896101872188e-07, "loss": 0.00010828554513864219, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 485.8125, "completions/min_length": 427.0, "epoch": 9.320588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.7766690254211426, "kl": 0.014185710344463587, "learning_rate": 6.444667554247375e-07, "loss": 0.0001426488161087036, "reward": 0.6000000238418579, "reward_std": 0.4823840856552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 6338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 465.25, "completions/min_length": 386.0, "epoch": 9.322058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04436728358268738, "kl": 0.011708781123161316, "learning_rate": 6.44343891145152e-07, "loss": 0.00011587594781303778, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 420.9375, "completions/min_length": 385.0, "epoch": 9.323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9967619776725769, "kl": 0.012103817309252918, "learning_rate": 6.442210173565561e-07, "loss": 0.00012088567018508911, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 475.0, "completions/min_length": 402.0, "epoch": 9.325, "frac_reward_zero_std": 0.5, "grad_norm": 1.1966067552566528, "kl": 0.010850290767848492, "learning_rate": 6.440981340670446e-07, "loss": 0.00010910660785157233, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 463.8125, "completions/min_length": 422.0, "epoch": 9.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012819224037230015, "kl": 0.01157886371947825, "learning_rate": 6.439752412847128e-07, "loss": 0.00011546241876203567, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 492.125, "completions/min_length": 407.0, "epoch": 9.327941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3280715942382812, "kl": 0.010050452081486583, "learning_rate": 6.438523390176563e-07, "loss": 0.00010117888450622559, "reward": 0.5013166666030884, "reward_std": 0.41942736506462097, "rewards/DrugCombAccuracyCOTORM/mean": 0.4011250138282776, "rewards/DrugCombAccuracyCOTORM/std": 0.4839547872543335, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8041666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.5044799447059631, "step": 6343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 427.0, "completions/min_length": 380.0, "epoch": 9.329411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.028714606538414955, "kl": 0.01389388320967555, "learning_rate": 6.437294272739718e-07, "loss": 0.00013800743909087032, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 399.75, "completions/min_length": 367.0, "epoch": 9.330882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.015264902263879776, "kl": 0.007910015992820263, "learning_rate": 6.436065060617566e-07, "loss": 7.874779112171382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 453.75, "completions/min_length": 379.0, "epoch": 9.33235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9050871133804321, "kl": 0.011586414417251945, "learning_rate": 6.434835753891079e-07, "loss": 0.00011456701759016141, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 463.875, "completions/min_length": 400.0, "epoch": 9.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1866413354873657, "kl": 0.013182567432522774, "learning_rate": 6.433606352641246e-07, "loss": 0.00013318714627530426, "reward": 0.4000000059604645, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 431.8125, "completions/min_length": 381.0, "epoch": 9.33529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8118485808372498, "kl": 0.008557049790397286, "learning_rate": 6.432376856949052e-07, "loss": 8.565932512283325e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 447.25, "completions/min_length": 351.0, "epoch": 9.336764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.01378019992262125, "kl": 0.01295641832984984, "learning_rate": 6.431147266895496e-07, "loss": 0.00013039621990174055, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 457.9375, "completions/min_length": 381.0, "epoch": 9.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010210420936346054, "kl": 0.008095224970020354, "learning_rate": 6.429917582561581e-07, "loss": 8.104000153252855e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 484.625, "completions/min_length": 408.0, "epoch": 9.339705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0266842320561409, "kl": 0.009102633339352906, "learning_rate": 6.428687804028315e-07, "loss": 9.175765444524586e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 448.0, "completions/min_length": 371.0, "epoch": 9.341176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9694388508796692, "kl": 0.014940149034373462, "learning_rate": 6.427457931376711e-07, "loss": 0.00014908368757460266, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 468.375, "completions/min_length": 403.0, "epoch": 9.342647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9289654493331909, "kl": 0.010768747655674815, "learning_rate": 6.426227964687791e-07, "loss": 0.00010925067181233317, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 484.375, "completions/min_length": 449.0, "epoch": 9.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.7989965677261353, "kl": 0.00842038239352405, "learning_rate": 6.424997904042581e-07, "loss": 8.394614269491285e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 452.3125, "completions/min_length": 403.0, "epoch": 9.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.017572147771716118, "kl": 0.010158068500459194, "learning_rate": 6.423767749522116e-07, "loss": 0.00010108168498845771, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 432.0625, "completions/min_length": 344.0, "epoch": 9.347058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.019938115030527115, "kl": 0.010310281999409199, "learning_rate": 6.422537501207434e-07, "loss": 0.00010307361662853509, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 476.9375, "completions/min_length": 395.0, "epoch": 9.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7360900044441223, "kl": 0.010108440183103085, "learning_rate": 6.421307159179584e-07, "loss": 0.00010044127702713013, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 517.9375, "completions/min_length": 461.0, "epoch": 9.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.5871477127075195, "kl": 0.010001339484006166, "learning_rate": 6.420076723519614e-07, "loss": 0.00010082125663757324, "reward": 0.7316083312034607, "reward_std": 0.3349636197090149, "rewards/DrugCombAccuracyCOTORM/mean": 0.6866458654403687, "rewards/DrugCombAccuracyCOTORM/std": 0.4204879701137543, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.28198206424713135, "step": 6358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 453.875, "completions/min_length": 369.0, "epoch": 9.351470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.2237671613693237, "kl": 0.01506966003216803, "learning_rate": 6.418846194308583e-07, "loss": 0.00014800092321820557, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 463.375, "completions/min_length": 379.0, "epoch": 9.352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01567060500383377, "kl": 0.00762837182264775, "learning_rate": 6.417615571627554e-07, "loss": 7.494372403016314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 474.0625, "completions/min_length": 391.0, "epoch": 9.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015562922693789005, "kl": 0.00918258458841592, "learning_rate": 6.416384855557599e-07, "loss": 9.153310384135693e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 400.8125, "completions/min_length": 354.0, "epoch": 9.355882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.018710393458604813, "kl": 0.008158233715221286, "learning_rate": 6.415154046179796e-07, "loss": 8.091253403108567e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 461.5, "completions/min_length": 423.0, "epoch": 9.35735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0812649726867676, "kl": 0.009725023410283029, "learning_rate": 6.413923143575225e-07, "loss": 9.791553020477295e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 407.1875, "completions/min_length": 380.0, "epoch": 9.358823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03505435213446617, "kl": 0.007968429243192077, "learning_rate": 6.412692147824975e-07, "loss": 7.961965457070619e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 476.875, "completions/min_length": 401.0, "epoch": 9.360294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0801093578338623, "kl": 0.014139016391709447, "learning_rate": 6.411461059010141e-07, "loss": 0.0001406985684297979, "reward": 0.30000001192092896, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 445.9375, "completions/min_length": 397.0, "epoch": 9.361764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.024169335141777992, "kl": 0.009246117318980396, "learning_rate": 6.410229877211825e-07, "loss": 9.205008973367512e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 525.3125, "completions/min_length": 426.0, "epoch": 9.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9488604664802551, "kl": 0.01203056238591671, "learning_rate": 6.408998602511134e-07, "loss": 0.00011924379214178771, "reward": 0.36288541555404663, "reward_std": 0.1112755686044693, "rewards/DrugCombAccuracyCOTORM/mean": 0.2709895968437195, "rewards/DrugCombAccuracyCOTORM/std": 0.33682137727737427, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4609375, "rewards/DrugCombCoverageCOTORM/std": 0.4781011939048767, "step": 6367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 437.5625, "completions/min_length": 397.0, "epoch": 9.364705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.014503788203001022, "kl": 0.00893733510747552, "learning_rate": 6.407767234989181e-07, "loss": 8.9695502538234e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 435.5625, "completions/min_length": 375.0, "epoch": 9.366176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.035629142075777054, "kl": 0.010289595229551196, "learning_rate": 6.406535774727085e-07, "loss": 0.00010305364412488416, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 438.4375, "completions/min_length": 388.0, "epoch": 9.367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9245631694793701, "kl": 0.010155918425880373, "learning_rate": 6.40530422180597e-07, "loss": 0.00010185980499954894, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 473.6875, "completions/min_length": 437.0, "epoch": 9.369117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0025784969329834, "kl": 0.01240536104887724, "learning_rate": 6.404072576306971e-07, "loss": 0.00012326263822615147, "reward": 0.48749998211860657, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 407.125, "completions/min_length": 351.0, "epoch": 9.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012150965631008148, "kl": 0.006983868079259992, "learning_rate": 6.402840838311223e-07, "loss": 7.030973210930824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 421.25, "completions/min_length": 353.0, "epoch": 9.37205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.014981625601649284, "kl": 0.008449746412225068, "learning_rate": 6.40160900789987e-07, "loss": 8.477625669911504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 420.0625, "completions/min_length": 338.0, "epoch": 9.373529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.012395357713103294, "kl": 0.010335348080843687, "learning_rate": 6.400377085154065e-07, "loss": 0.000103432968899142, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 425.6875, "completions/min_length": 373.0, "epoch": 9.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.020091701298952103, "kl": 0.008588580298237503, "learning_rate": 6.39914507015496e-07, "loss": 8.593696111347526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 473.0, "completions/min_length": 430.0, "epoch": 9.376470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.013596204109489918, "kl": 0.009915619622915983, "learning_rate": 6.397912962983719e-07, "loss": 9.97545721475035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/mean_length": 502.3125, "completions/min_length": 393.0, "epoch": 9.37794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8326756954193115, "kl": 0.00885528838261962, "learning_rate": 6.396680763721509e-07, "loss": 8.752718713367358e-05, "reward": 0.9588750004768372, "reward_std": 0.056757885962724686, "rewards/DrugCombAccuracyCOTORM/mean": 0.9524999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.10212194174528122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 6377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 458.9375, "completions/min_length": 387.0, "epoch": 9.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011450985446572304, "kl": 0.008772151079028845, "learning_rate": 6.395448472449506e-07, "loss": 8.749873086344451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 430.3125, "completions/min_length": 378.0, "epoch": 9.380882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.011772525496780872, "kl": 0.008476827177219093, "learning_rate": 6.394216089248888e-07, "loss": 8.486222213832662e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 464.3125, "completions/min_length": 377.0, "epoch": 9.382352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.511526107788086, "kl": 0.01360113825649023, "learning_rate": 6.392983614200843e-07, "loss": 0.00013615936040878296, "reward": 0.6937500238418579, "reward_std": 0.4232165217399597, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 403.5, "completions/min_length": 344.0, "epoch": 9.383823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.0497400760650635, "kl": 0.013005118118599057, "learning_rate": 6.391751047386563e-07, "loss": 0.0001315101981163025, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 472.375, "completions/min_length": 399.0, "epoch": 9.385294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9734022617340088, "kl": 0.010160016128793359, "learning_rate": 6.390518388887245e-07, "loss": 0.00010202007979387417, "reward": 0.9423294067382812, "reward_std": 0.1399787962436676, "rewards/DrugCombAccuracyCOTORM/mean": 0.9279117584228516, "rewards/DrugCombAccuracyCOTORM/std": 0.25038474798202515, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 456.875, "completions/min_length": 407.0, "epoch": 9.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.970156729221344, "kl": 0.010592282749712467, "learning_rate": 6.389285638784096e-07, "loss": 0.00010491907596588135, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 532.375, "completions/min_length": 491.0, "epoch": 9.388235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011301562190055847, "kl": 0.008874276070855558, "learning_rate": 6.388052797158323e-07, "loss": 8.868255099514499e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 451.75, "completions/min_length": 357.0, "epoch": 9.389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.020589632913470268, "kl": 0.009378119022585452, "learning_rate": 6.386819864091145e-07, "loss": 9.398167458130047e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 504.3125, "completions/min_length": 424.0, "epoch": 9.391176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.14402437210083, "kl": 0.010665555018931627, "learning_rate": 6.385586839663785e-07, "loss": 0.0001062154769897461, "reward": 0.9301249980926514, "reward_std": 0.19763633608818054, "rewards/DrugCombAccuracyCOTORM/mean": 0.9165624976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.23605592548847198, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 477.0, "completions/min_length": 423.0, "epoch": 9.39264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016881508752703667, "kl": 0.010826077777892351, "learning_rate": 6.384353723957471e-07, "loss": 0.0001083169408957474, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 419.9375, "completions/min_length": 359.0, "epoch": 9.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01116831786930561, "kl": 0.008074284065514803, "learning_rate": 6.383120517053435e-07, "loss": 8.078562677837908e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 485.25, "completions/min_length": 399.0, "epoch": 9.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7398633360862732, "kl": 0.009577620076015592, "learning_rate": 6.381887219032921e-07, "loss": 9.576976299285889e-05, "reward": 0.8633333444595337, "reward_std": 0.11313708871603012, "rewards/DrugCombAccuracyCOTORM/mean": 0.8500000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.24765567481517792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 6389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 489.3125, "completions/min_length": 447.0, "epoch": 9.397058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.4993376731872559, "kl": 0.012490265304222703, "learning_rate": 6.380653829977173e-07, "loss": 0.0001246035099029541, "reward": 0.643750011920929, "reward_std": 0.3980313539505005, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 481.875, "completions/min_length": 401.0, "epoch": 9.398529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.652947187423706, "kl": 0.011738693807274103, "learning_rate": 6.379420349967446e-07, "loss": 0.0001177564263343811, "reward": 0.5713750123977661, "reward_std": 0.17836894094944, "rewards/DrugCombAccuracyCOTORM/mean": 0.49937498569488525, "rewards/DrugCombAccuracyCOTORM/std": 0.4599524438381195, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.3145764470100403, "step": 6391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.9375, "completions/min_length": 403.0, "epoch": 9.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.9617298245429993, "kl": 0.01073815138079226, "learning_rate": 6.378186779084995e-07, "loss": 0.00010857864981517196, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 435.625, "completions/min_length": 383.0, "epoch": 9.401470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5381766557693481, "kl": 0.013938642106950283, "learning_rate": 6.37695311741109e-07, "loss": 0.00013970956206321716, "reward": 0.7490833401679993, "reward_std": 0.3850402235984802, "rewards/DrugCombAccuracyCOTORM/mean": 0.7150000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4409988820552826, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.39849257469177246, "step": 6393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/mean_length": 550.9375, "completions/min_length": 393.0, "epoch": 9.402941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4923175573349, "kl": 0.012914260616526008, "learning_rate": 6.375719365026995e-07, "loss": 0.00012809038162231445, "reward": 0.5678575038909912, "reward_std": 0.29516083002090454, "rewards/DrugCombAccuracyCOTORM/mean": 0.4823913276195526, "rewards/DrugCombAccuracyCOTORM/std": 0.444668710231781, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8194444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.49752476811408997, "step": 6394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 520.6875, "completions/min_length": 461.0, "epoch": 9.404411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4200315475463867, "kl": 0.009609755361452699, "learning_rate": 6.374485522013993e-07, "loss": 9.670853614807129e-05, "reward": 0.5483125448226929, "reward_std": 0.14619433879852295, "rewards/DrugCombAccuracyCOTORM/mean": 0.4529687464237213, "rewards/DrugCombAccuracyCOTORM/std": 0.5018875002861023, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.49973952770233154, "step": 6395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 426.5, "completions/min_length": 383.0, "epoch": 9.405882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011734685860574245, "kl": 0.007874006405472755, "learning_rate": 6.373251588453361e-07, "loss": 7.886825187597424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 450.625, "completions/min_length": 416.0, "epoch": 9.407352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01957414299249649, "kl": 0.00759943132288754, "learning_rate": 6.372017564426389e-07, "loss": 7.49787941458635e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 439.0, "completions/min_length": 402.0, "epoch": 9.408823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1806601285934448, "kl": 0.00851064664311707, "learning_rate": 6.370783450014374e-07, "loss": 8.547801553504542e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 464.0, "completions/min_length": 338.0, "epoch": 9.410294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020724786445498466, "kl": 0.009572344017215073, "learning_rate": 6.369549245298613e-07, "loss": 9.597194730304182e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 465.8125, "completions/min_length": 401.0, "epoch": 9.411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00966455228626728, "kl": 0.008233271539211273, "learning_rate": 6.368314950360415e-07, "loss": 8.196089038392529e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 423.375, "completions/min_length": 361.0, "epoch": 9.413235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.007036479189991951, "kl": 0.0058888227213174105, "learning_rate": 6.367080565281089e-07, "loss": 5.861304816789925e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 427.6875, "completions/min_length": 348.0, "epoch": 9.41470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.026410745456814766, "kl": 0.01037957752123475, "learning_rate": 6.365846090141957e-07, "loss": 0.000104345272120554, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 434.375, "completions/min_length": 376.0, "epoch": 9.416176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.06969305872917175, "kl": 0.012372419703751802, "learning_rate": 6.36461152502434e-07, "loss": 0.00012373975187074393, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 426.125, "completions/min_length": 371.0, "epoch": 9.41764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2194387912750244, "kl": 0.010113295749761164, "learning_rate": 6.363376870009568e-07, "loss": 0.00010036677122116089, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 456.3125, "completions/min_length": 413.0, "epoch": 9.419117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.856348991394043, "kl": 0.009118340210989118, "learning_rate": 6.36214212517898e-07, "loss": 9.174644947052002e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 6405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 483.6875, "completions/min_length": 361.0, "epoch": 9.420588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.5079904794692993, "kl": 0.012883016839623451, "learning_rate": 6.360907290613915e-07, "loss": 0.0001285076141357422, "reward": 0.6158305406570435, "reward_std": 0.1777883768081665, "rewards/DrugCombAccuracyCOTORM/mean": 0.5381041765213013, "rewards/DrugCombAccuracyCOTORM/std": 0.42781952023506165, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8534722328186035, "rewards/DrugCombCoverageCOTORM/std": 0.1682257354259491, "step": 6406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 433.5, "completions/min_length": 398.0, "epoch": 9.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.6666500568389893, "kl": 0.02378884912468493, "learning_rate": 6.359672366395722e-07, "loss": 0.0002520931593608111, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 493.3125, "completions/min_length": 404.0, "epoch": 9.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9863300323486328, "kl": 0.009136336855590343, "learning_rate": 6.358437352605754e-07, "loss": 9.090491948882118e-05, "reward": 0.5249999761581421, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 6408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 485.1875, "completions/min_length": 427.0, "epoch": 9.425, "frac_reward_zero_std": 0.5, "grad_norm": 1.1432297229766846, "kl": 0.013046322157606483, "learning_rate": 6.357202249325371e-07, "loss": 0.00012924743350595236, "reward": 0.8276666402816772, "reward_std": 0.2407720386981964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.730296790599823, "step": 6409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 524.4375, "completions/min_length": 450.0, "epoch": 9.426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9696816205978394, "kl": 0.013991191983222961, "learning_rate": 6.355967056635939e-07, "loss": 0.00014182180166244507, "reward": 0.8333333730697632, "reward_std": 0.1553286463022232, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 452.4375, "completions/min_length": 332.0, "epoch": 9.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9382824897766113, "kl": 0.00818091572728008, "learning_rate": 6.354731774618829e-07, "loss": 8.164800965460017e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 466.6875, "completions/min_length": 367.0, "epoch": 9.429411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.03454245254397392, "kl": 0.01246330514550209, "learning_rate": 6.353496403355419e-07, "loss": 0.00012436442193575203, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 453.375, "completions/min_length": 379.0, "epoch": 9.430882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.015821905806660652, "kl": 0.012579424306750298, "learning_rate": 6.352260942927088e-07, "loss": 0.0001246140745934099, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 454.5625, "completions/min_length": 391.0, "epoch": 9.43235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0661916732788086, "kl": 0.010754179558716714, "learning_rate": 6.35102539341523e-07, "loss": 0.00010682048741728067, "reward": 0.5949166417121887, "reward_std": 0.0552385114133358, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.49018141627311707, "step": 6414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 463.3125, "completions/min_length": 407.0, "epoch": 9.433823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.1147618293762207, "kl": 0.010760535020381212, "learning_rate": 6.349789754901238e-07, "loss": 0.00010833144187927246, "reward": 0.8464166522026062, "reward_std": 0.21689993143081665, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 6415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 450.375, "completions/min_length": 396.0, "epoch": 9.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010726602748036385, "kl": 0.010490147513337433, "learning_rate": 6.348554027466512e-07, "loss": 0.00010446006490383297, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 505.4375, "completions/min_length": 459.0, "epoch": 9.436764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.018429532647132874, "kl": 0.009683703305199742, "learning_rate": 6.347318211192463e-07, "loss": 9.66255902312696e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 464.1875, "completions/min_length": 402.0, "epoch": 9.438235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0813077688217163, "kl": 0.012404377339407802, "learning_rate": 6.346082306160497e-07, "loss": 0.00012499417061917484, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 421.875, "completions/min_length": 374.0, "epoch": 9.439705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0407679080963135, "kl": 0.01416371250525117, "learning_rate": 6.344846312452036e-07, "loss": 0.00014053285121917725, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 437.25, "completions/min_length": 404.0, "epoch": 9.441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.020207630470395088, "kl": 0.010998780839145184, "learning_rate": 6.343610230148502e-07, "loss": 0.00010978720820276067, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 491.4375, "completions/min_length": 420.0, "epoch": 9.44264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9038731455802917, "kl": 0.008846773533150554, "learning_rate": 6.342374059331327e-07, "loss": 8.83563407114707e-05, "reward": 0.7320833206176758, "reward_std": 0.17561084032058716, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.41161268949508667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 6421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 484.1875, "completions/min_length": 412.0, "epoch": 9.444117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9092094302177429, "kl": 0.011975149973295629, "learning_rate": 6.341137800081946e-07, "loss": 0.00012469069042708725, "reward": 0.8979166746139526, "reward_std": 0.1890740841627121, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 460.125, "completions/min_length": 381.0, "epoch": 9.445588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04409739747643471, "kl": 0.011204234091565013, "learning_rate": 6.339901452481801e-07, "loss": 0.00011161658767377958, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 524.1875, "completions/min_length": 448.0, "epoch": 9.447058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.974665641784668, "kl": 0.015719785122200847, "learning_rate": 6.338665016612341e-07, "loss": 0.00015456229448318481, "reward": 0.44329166412353516, "reward_std": 0.25316542387008667, "rewards/DrugCombAccuracyCOTORM/mean": 0.32624998688697815, "rewards/DrugCombAccuracyCOTORM/std": 0.47225525975227356, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3520771861076355, "step": 6424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 447.625, "completions/min_length": 363.0, "epoch": 9.448529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.2075889110565186, "kl": 0.01198739418759942, "learning_rate": 6.337428492555015e-07, "loss": 0.00012087076902389526, "reward": 0.737333357334137, "reward_std": 0.19210051000118256, "rewards/DrugCombAccuracyCOTORM/mean": 0.67166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.4718286097049713, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 470.125, "completions/min_length": 404.0, "epoch": 9.45, "frac_reward_zero_std": 0.5, "grad_norm": 0.9247981905937195, "kl": 0.011461029760539532, "learning_rate": 6.336191880391284e-07, "loss": 0.00011511376214912161, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 470.75, "completions/min_length": 377.0, "epoch": 9.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1731077432632446, "kl": 0.01034834049642086, "learning_rate": 6.334955180202614e-07, "loss": 0.00010398775339126587, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 410.9375, "completions/min_length": 357.0, "epoch": 9.452941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012536004185676575, "kl": 0.006962922983802855, "learning_rate": 6.333718392070476e-07, "loss": 6.903480971232057e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 448.6875, "completions/min_length": 409.0, "epoch": 9.454411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.948733389377594, "kl": 0.01016074197832495, "learning_rate": 6.332481516076345e-07, "loss": 0.00010184769053012133, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 440.6875, "completions/min_length": 353.0, "epoch": 9.455882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.1017169952392578, "kl": 0.008541911141946912, "learning_rate": 6.331244552301704e-07, "loss": 8.530914783477783e-05, "reward": 0.5600833296775818, "reward_std": 0.2046433538198471, "rewards/DrugCombAccuracyCOTORM/mean": 0.47874999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.48152363300323486, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.26440009474754333, "step": 6430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 427.8125, "completions/min_length": 390.0, "epoch": 9.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 3.230638027191162, "kl": 0.011091293767094612, "learning_rate": 6.33000750082804e-07, "loss": 0.00011139363050460815, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 465.0625, "completions/min_length": 414.0, "epoch": 9.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.029120637103915215, "kl": 0.010110035538673401, "learning_rate": 6.328770361736849e-07, "loss": 0.00010098641359945759, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 454.75, "completions/min_length": 375.0, "epoch": 9.46029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0602253675460815, "kl": 0.012579672504216433, "learning_rate": 6.327533135109629e-07, "loss": 0.00012510002125054598, "reward": 0.5232666730880737, "reward_std": 0.04533372446894646, "rewards/DrugCombAccuracyCOTORM/mean": 0.5082499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.5088768601417542, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.9583937525749207, "step": 6433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 408.8125, "completions/min_length": 324.0, "epoch": 9.461764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.013192558661103249, "kl": 0.009314726805314422, "learning_rate": 6.326295821027887e-07, "loss": 9.297554061049595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 499.9375, "completions/min_length": 412.0, "epoch": 9.463235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4218885898590088, "kl": 0.011485852766782045, "learning_rate": 6.32505841957313e-07, "loss": 0.00011500716209411621, "reward": 0.6635416746139526, "reward_std": 0.3470265865325928, "rewards/DrugCombAccuracyCOTORM/mean": 0.6145833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.40239447355270386, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5153881907463074, "step": 6435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 510.6875, "completions/min_length": 390.0, "epoch": 9.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0054455995559692, "kl": 0.011223069741390646, "learning_rate": 6.323820930826879e-07, "loss": 0.00011288054520264268, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 504.0, "completions/min_length": 398.0, "epoch": 9.466176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.2822978496551514, "kl": 0.014087414368987083, "learning_rate": 6.322583354870655e-07, "loss": 0.00014160573482513428, "reward": 0.3772083520889282, "reward_std": 0.28046947717666626, "rewards/DrugCombAccuracyCOTORM/mean": 0.2462500035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.39533179998397827, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2561737895011902, "step": 6437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 442.4375, "completions/min_length": 363.0, "epoch": 9.467647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013330068439245224, "kl": 0.008361766231246293, "learning_rate": 6.321345691785987e-07, "loss": 8.346751565113664e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 432.25, "completions/min_length": 375.0, "epoch": 9.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014727850444614887, "kl": 0.0083834562683478, "learning_rate": 6.320107941654409e-07, "loss": 8.308707765536383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 469.1875, "completions/min_length": 420.0, "epoch": 9.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2291154861450195, "kl": 0.01611707452684641, "learning_rate": 6.318870104557459e-07, "loss": 0.00015913695096969604, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.625, "completions/min_length": 351.0, "epoch": 9.472058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0051289796829224, "kl": 0.009457736741751432, "learning_rate": 6.317632180576686e-07, "loss": 9.428097837371752e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 399.375, "completions/min_length": 336.0, "epoch": 9.473529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01706940494477749, "kl": 0.011201137444004416, "learning_rate": 6.316394169793637e-07, "loss": 0.0001117974316002801, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 495.8125, "completions/min_length": 419.0, "epoch": 9.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.8677253723144531, "kl": 0.010370032512582839, "learning_rate": 6.315156072289873e-07, "loss": 0.00010402704356238246, "reward": 0.9175000190734863, "reward_std": 0.15301470458507538, "rewards/DrugCombAccuracyCOTORM/mean": 0.9007812738418579, "rewards/DrugCombAccuracyCOTORM/std": 0.27153533697128296, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 6443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 477.625, "completions/min_length": 397.0, "epoch": 9.476470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4740709066390991, "kl": 0.010028887307271361, "learning_rate": 6.313917888146955e-07, "loss": 0.00010026246309280396, "reward": 0.824999988079071, "reward_std": 0.37287637591362, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 455.6875, "completions/min_length": 363.0, "epoch": 9.477941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.0207650661468506, "kl": 0.009079564595595002, "learning_rate": 6.312679617446452e-07, "loss": 9.124830103246495e-05, "reward": 0.637499988079071, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 6445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 439.125, "completions/min_length": 369.0, "epoch": 9.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012827789410948753, "kl": 0.008669284987263381, "learning_rate": 6.311441260269938e-07, "loss": 8.683474879944697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 517.5, "completions/min_length": 475.0, "epoch": 9.480882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.015852058306336403, "kl": 0.009983052965253592, "learning_rate": 6.310202816698993e-07, "loss": 9.958634473150596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 500.6875, "completions/min_length": 422.0, "epoch": 9.48235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.4137848615646362, "kl": 0.010336526087485254, "learning_rate": 6.308964286815202e-07, "loss": 0.00010363012552261353, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 419.3125, "completions/min_length": 333.0, "epoch": 9.483823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.025613119825720787, "kl": 0.008722379920072854, "learning_rate": 6.307725670700155e-07, "loss": 8.753228757996112e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 390.0, "completions/min_length": 345.0, "epoch": 9.485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.018237987533211708, "kl": 0.010419034864753485, "learning_rate": 6.306486968435451e-07, "loss": 0.00010394700802862644, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 423.5625, "completions/min_length": 365.0, "epoch": 9.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02548086643218994, "kl": 0.008172030793502927, "learning_rate": 6.305248180102693e-07, "loss": 8.132337097777054e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 429.25, "completions/min_length": 360.0, "epoch": 9.488235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2688050270080566, "kl": 0.013119458453729749, "learning_rate": 6.304009305783486e-07, "loss": 0.00013168901205062866, "reward": 0.36994582414627075, "reward_std": 0.22310324013233185, "rewards/DrugCombAccuracyCOTORM/mean": 0.26490622758865356, "rewards/DrugCombAccuracyCOTORM/std": 0.30058789253234863, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5802083611488342, "rewards/DrugCombCoverageCOTORM/std": 0.449576735496521, "step": 6452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 454.3125, "completions/min_length": 392.0, "epoch": 9.489705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.4023027420043945, "kl": 0.01022400357760489, "learning_rate": 6.302770345559447e-07, "loss": 0.00010278820991516113, "reward": 0.3999999761581421, "reward_std": 0.41874149441719055, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 492.0, "completions/min_length": 399.0, "epoch": 9.491176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.2161179780960083, "kl": 0.007580709992907941, "learning_rate": 6.301531299512194e-07, "loss": 7.560290396213531e-05, "reward": 0.8988749980926514, "reward_std": 0.18881995975971222, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 6454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 452.1875, "completions/min_length": 375.0, "epoch": 9.492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.033414602279663, "kl": 0.011590692680329084, "learning_rate": 6.300292167723353e-07, "loss": 0.00011490906763356179, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 411.6875, "completions/min_length": 353.0, "epoch": 9.494117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.037533510476350784, "kl": 0.010250447667203844, "learning_rate": 6.299052950274554e-07, "loss": 0.00010147823195438832, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 491.0, "completions/min_length": 451.0, "epoch": 9.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.018771667033433914, "kl": 0.010525933001190424, "learning_rate": 6.297813647247437e-07, "loss": 0.00010449934052303433, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 467.5, "completions/min_length": 400.0, "epoch": 9.49705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8368655443191528, "kl": 0.008455651928670704, "learning_rate": 6.296574258723639e-07, "loss": 8.479334064759314e-05, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 6458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 508.9375, "completions/min_length": 433.0, "epoch": 9.498529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.03908544033765793, "kl": 0.010424005798995495, "learning_rate": 6.295334784784809e-07, "loss": 0.00010331695375498384, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 472.625, "completions/min_length": 390.0, "epoch": 9.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.3229066133499146, "kl": 0.015083085745573044, "learning_rate": 6.294095225512604e-07, "loss": 0.00015061721205711365, "reward": 0.34687501192092896, "reward_std": 0.2706533968448639, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.879749059677124, "step": 6460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 510.875, "completions/min_length": 417.0, "epoch": 9.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0377386808395386, "kl": 0.011760724009945989, "learning_rate": 6.292855580988679e-07, "loss": 0.00011810711293946952, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 447.5, "completions/min_length": 364.0, "epoch": 9.50294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012228687293827534, "kl": 0.008319351356476545, "learning_rate": 6.2916158512947e-07, "loss": 8.315950253745541e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 468.9375, "completions/min_length": 385.0, "epoch": 9.504411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9625467658042908, "kl": 0.009863253799267113, "learning_rate": 6.290376036512338e-07, "loss": 9.870901703834534e-05, "reward": 0.6979166865348816, "reward_std": 0.1732480525970459, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4629814922809601, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 479.375, "completions/min_length": 404.0, "epoch": 9.505882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.696772575378418, "kl": 0.007412767503410578, "learning_rate": 6.289136136723268e-07, "loss": 7.419098255923018e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 429.125, "completions/min_length": 347.0, "epoch": 9.507352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9172527194023132, "kl": 0.0075525506399571896, "learning_rate": 6.287896152009171e-07, "loss": 7.651421037735417e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 494.6875, "completions/min_length": 393.0, "epoch": 9.508823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.4547215700149536, "kl": 0.010060374159365892, "learning_rate": 6.286656082451736e-07, "loss": 0.00010055303573608398, "reward": 0.6144166588783264, "reward_std": 0.290227472782135, "rewards/DrugCombAccuracyCOTORM/mean": 0.5258333683013916, "rewards/DrugCombAccuracyCOTORM/std": 0.48839497566223145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 452.0625, "completions/min_length": 383.0, "epoch": 9.510294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8023499250411987, "kl": 0.009038830758072436, "learning_rate": 6.285415928132653e-07, "loss": 8.952617645263672e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 437.625, "completions/min_length": 383.0, "epoch": 9.511764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 3.6742470264434814, "kl": 0.013550746953114867, "learning_rate": 6.284175689133623e-07, "loss": 0.00013349950313568115, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 511.8125, "completions/min_length": 368.0, "epoch": 9.513235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1285412311553955, "kl": 0.008210620610043406, "learning_rate": 6.28293536553635e-07, "loss": 8.106976747512817e-05, "reward": 0.7211999893188477, "reward_std": 0.35942310094833374, "rewards/DrugCombAccuracyCOTORM/mean": 0.6624374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.4717079699039459, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.2526525855064392, "step": 6469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 505.6875, "completions/min_length": 398.0, "epoch": 9.514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9361937046051025, "kl": 0.009472634177654982, "learning_rate": 6.281694957422539e-07, "loss": 9.488966315984726e-05, "reward": 0.6937500238418579, "reward_std": 0.1898072361946106, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 469.6875, "completions/min_length": 423.0, "epoch": 9.516176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1265175342559814, "kl": 0.026987999444827437, "learning_rate": 6.280454464873908e-07, "loss": 0.0002672523260116577, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 423.6875, "completions/min_length": 370.0, "epoch": 9.51764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010928859002888203, "kl": 0.007928177597932518, "learning_rate": 6.279213887972178e-07, "loss": 7.94773586676456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 461.625, "completions/min_length": 395.0, "epoch": 9.519117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8144680261611938, "kl": 0.011181566631421447, "learning_rate": 6.277973226799074e-07, "loss": 0.0001127924770116806, "reward": 0.7363958358764648, "reward_std": 0.18051396310329437, "rewards/DrugCombAccuracyCOTORM/mean": 0.7069531083106995, "rewards/DrugCombAccuracyCOTORM/std": 0.38879889249801636, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 6473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 495.625, "completions/min_length": 411.0, "epoch": 9.520588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.007719972170889378, "kl": 0.006345552043057978, "learning_rate": 6.276732481436327e-07, "loss": 6.339552055578679e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 435.5, "completions/min_length": 408.0, "epoch": 9.522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.00983687024563551, "kl": 0.008109913440421224, "learning_rate": 6.275491651965677e-07, "loss": 8.099907427094877e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 449.5625, "completions/min_length": 403.0, "epoch": 9.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.6618766784667969, "kl": 0.01084803941193968, "learning_rate": 6.274250738468864e-07, "loss": 0.00010849847603822127, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 479.6875, "completions/min_length": 362.0, "epoch": 9.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.03660516440868378, "kl": 0.01019958162214607, "learning_rate": 6.273009741027637e-07, "loss": 0.00010200896940659732, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 473.5625, "completions/min_length": 406.0, "epoch": 9.526470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9278790950775146, "kl": 0.00897762447129935, "learning_rate": 6.27176865972375e-07, "loss": 8.97288991836831e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 500.5625, "completions/min_length": 450.0, "epoch": 9.527941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4076036214828491, "kl": 0.00997255090624094, "learning_rate": 6.270527494638963e-07, "loss": 9.963661432266235e-05, "reward": 0.36288541555404663, "reward_std": 0.20363128185272217, "rewards/DrugCombAccuracyCOTORM/mean": 0.22755581140518188, "rewards/DrugCombAccuracyCOTORM/std": 0.3178594708442688, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8084077835083008, "rewards/DrugCombCoverageCOTORM/std": 0.21130219101905823, "step": 6479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 454.6875, "completions/min_length": 367.0, "epoch": 9.529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.644941806793213, "kl": 0.014275248162448406, "learning_rate": 6.269286245855038e-07, "loss": 0.00014127790927886963, "reward": 0.7975708246231079, "reward_std": 0.08088906109333038, "rewards/DrugCombAccuracyCOTORM/mean": 0.7645416855812073, "rewards/DrugCombAccuracyCOTORM/std": 0.2804461419582367, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.1463087499141693, "step": 6480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 462.0, "completions/min_length": 383.0, "epoch": 9.530882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010080772452056408, "kl": 0.008479675510898232, "learning_rate": 6.268044913453749e-07, "loss": 8.403295942116529e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 398.875, "completions/min_length": 361.0, "epoch": 9.532352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015327733010053635, "kl": 0.012229772051796317, "learning_rate": 6.26680349751687e-07, "loss": 0.0001221038110088557, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 427.3125, "completions/min_length": 379.0, "epoch": 9.533823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008619743399322033, "kl": 0.008119300357066095, "learning_rate": 6.265561998126182e-07, "loss": 8.116687240544707e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 435.5, "completions/min_length": 372.0, "epoch": 9.535294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.049911122769117355, "kl": 0.01213584328070283, "learning_rate": 6.264320415363472e-07, "loss": 0.00012063359463354573, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 441.375, "completions/min_length": 362.0, "epoch": 9.536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019146746024489403, "kl": 0.009244471439160407, "learning_rate": 6.263078749310533e-07, "loss": 9.302307444158942e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 486.8125, "completions/min_length": 415.0, "epoch": 9.538235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.2068073749542236, "kl": 0.010702777071855962, "learning_rate": 6.261837000049163e-07, "loss": 0.00010595470666885376, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 466.9375, "completions/min_length": 425.0, "epoch": 9.53970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8888748288154602, "kl": 0.009783323388546705, "learning_rate": 6.260595167661164e-07, "loss": 9.76135052042082e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 489.4375, "completions/min_length": 433.0, "epoch": 9.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8921711444854736, "kl": 0.012460110476240516, "learning_rate": 6.259353252228345e-07, "loss": 0.000126262370031327, "reward": 0.40000003576278687, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 455.5, "completions/min_length": 402.0, "epoch": 9.54264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.117996096611023, "kl": 0.011440325761213899, "learning_rate": 6.258111253832523e-07, "loss": 0.00011401623487472534, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 433.375, "completions/min_length": 372.0, "epoch": 9.544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8488736152648926, "kl": 0.008790895924903452, "learning_rate": 6.256869172555513e-07, "loss": 8.785825775703415e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 477.5, "completions/min_length": 421.0, "epoch": 9.545588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0052601099014282, "kl": 0.00722433952614665, "learning_rate": 6.255627008479143e-07, "loss": 7.309019565582275e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 430.9375, "completions/min_length": 362.0, "epoch": 9.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158479928970337, "kl": 0.01087784580886364, "learning_rate": 6.254384761685244e-07, "loss": 0.00010914140875684097, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 492.25, "completions/min_length": 409.0, "epoch": 9.548529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.1810020208358765, "kl": 0.010028486023657024, "learning_rate": 6.253142432255653e-07, "loss": 0.0001004636287689209, "reward": 0.6669055819511414, "reward_std": 0.2842120826244354, "rewards/DrugCombAccuracyCOTORM/mean": 0.6157499551773071, "rewards/DrugCombAccuracyCOTORM/std": 0.38171395659446716, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7430555820465088, "rewards/DrugCombCoverageCOTORM/std": 0.28607964515686035, "step": 6493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 433.5625, "completions/min_length": 315.0, "epoch": 9.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.0185383390635252, "kl": 0.008456525509245694, "learning_rate": 6.251900020272207e-07, "loss": 8.461465768050402e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 428.8125, "completions/min_length": 381.0, "epoch": 9.551470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.03014606609940529, "kl": 0.008591546211391687, "learning_rate": 6.250657525816758e-07, "loss": 8.65603142301552e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 470.25, "completions/min_length": 341.0, "epoch": 9.552941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9049286246299744, "kl": 0.012088058982044458, "learning_rate": 6.249414948971154e-07, "loss": 0.00012279181100893766, "reward": 0.8250000476837158, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 428.625, "completions/min_length": 392.0, "epoch": 9.554411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.06123480573296547, "kl": 0.012351832818239927, "learning_rate": 6.248172289817256e-07, "loss": 0.00012593565043061972, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 467.375, "completions/min_length": 386.0, "epoch": 9.555882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02045184001326561, "kl": 0.009845402790233493, "learning_rate": 6.246929548436926e-07, "loss": 9.783589484868571e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 393.0, "completions/min_length": 359.0, "epoch": 9.55735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9888700246810913, "kl": 0.010658312705345452, "learning_rate": 6.245686724912035e-07, "loss": 0.00010685622692108154, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 473.875, "completions/min_length": 387.0, "epoch": 9.558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01586766168475151, "kl": 0.011529628420248628, "learning_rate": 6.244443819324453e-07, "loss": 0.00011498882668092847, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 500.125, "completions/min_length": 443.0, "epoch": 9.560294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0657979249954224, "kl": 0.015042208367958665, "learning_rate": 6.243200831756061e-07, "loss": 0.00015032291412353516, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 491.5625, "completions/min_length": 446.0, "epoch": 9.561764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.025710096582770348, "kl": 0.00943985569756478, "learning_rate": 6.241957762288746e-07, "loss": 9.38374869292602e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 477.375, "completions/min_length": 433.0, "epoch": 9.563235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.4413096904754639, "kl": 0.013309480156749487, "learning_rate": 6.240714611004395e-07, "loss": 0.00013277679681777954, "reward": 0.375, "reward_std": 0.3728764057159424, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 435.0, "completions/min_length": 367.0, "epoch": 9.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9307416677474976, "kl": 0.014797115232795477, "learning_rate": 6.239471377984906e-07, "loss": 0.00014925190771464258, "reward": 0.5249999761581421, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 6504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 447.8125, "completions/min_length": 399.0, "epoch": 9.566176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012998432852327824, "kl": 0.00931426859460771, "learning_rate": 6.23822806331218e-07, "loss": 9.241988300345838e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 433.875, "completions/min_length": 391.0, "epoch": 9.56764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009035966359078884, "kl": 0.007881698198616505, "learning_rate": 6.236984667068123e-07, "loss": 7.80202099122107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 472.8125, "completions/min_length": 384.0, "epoch": 9.569117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01976269669830799, "kl": 0.01103205606341362, "learning_rate": 6.235741189334645e-07, "loss": 0.00011077664385084063, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 458.4375, "completions/min_length": 392.0, "epoch": 9.570588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008125219494104385, "kl": 0.007541295955888927, "learning_rate": 6.234497630193665e-07, "loss": 7.51745465095155e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 439.1875, "completions/min_length": 375.0, "epoch": 9.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01594889722764492, "kl": 0.01035146601498127, "learning_rate": 6.233253989727106e-07, "loss": 0.00010317012493032962, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 491.625, "completions/min_length": 407.0, "epoch": 9.573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.0791159868240356, "kl": 0.013060137280263007, "learning_rate": 6.232010268016894e-07, "loss": 0.00013020634651184082, "reward": 0.7687291502952576, "reward_std": 0.13589029014110565, "rewards/DrugCombAccuracyCOTORM/mean": 0.7317447662353516, "rewards/DrugCombAccuracyCOTORM/std": 0.35778847336769104, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2357022613286972, "step": 6510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 478.0625, "completions/min_length": 399.0, "epoch": 9.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.9936399459838867, "kl": 0.01011446281336248, "learning_rate": 6.230766465144965e-07, "loss": 0.00010151129390578717, "reward": 0.6339166760444641, "reward_std": 0.15851351618766785, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 6511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 453.75, "completions/min_length": 390.0, "epoch": 9.576470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008587034419178963, "kl": 0.008119957172311842, "learning_rate": 6.229522581193256e-07, "loss": 8.108557085506618e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/mean_length": 557.9375, "completions/min_length": 427.0, "epoch": 9.577941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 2.3370285034179688, "kl": 0.050858649192377925, "learning_rate": 6.228278616243709e-07, "loss": 0.0004734508693218231, "reward": 0.7586203217506409, "reward_std": 0.2619161605834961, "rewards/DrugCombAccuracyCOTORM/mean": 0.7000114917755127, "rewards/DrugCombAccuracyCOTORM/std": 0.3651427924633026, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9861111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.0555555522441864, "step": 6513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 447.0, "completions/min_length": 405.0, "epoch": 9.579411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.020411303266882896, "kl": 0.007088727317750454, "learning_rate": 6.227034570378277e-07, "loss": 7.081609510350972e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 483.25, "completions/min_length": 451.0, "epoch": 9.580882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.904661238193512, "kl": 0.009964870754629374, "learning_rate": 6.225790443678911e-07, "loss": 9.916722774505615e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 454.4375, "completions/min_length": 365.0, "epoch": 9.58235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.7159907817840576, "kl": 0.012899326160550117, "learning_rate": 6.224546236227574e-07, "loss": 0.00012686103582382202, "reward": 0.606249988079071, "reward_std": 0.36740854382514954, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 501.0625, "completions/min_length": 429.0, "epoch": 9.583823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.346722960472107, "kl": 0.012604170246049762, "learning_rate": 6.22330194810623e-07, "loss": 0.00012592971324920654, "reward": 0.7359374761581421, "reward_std": 0.39229491353034973, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.49973952770233154, "step": 6517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 430.5625, "completions/min_length": 390.0, "epoch": 9.58529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1293166875839233, "kl": 0.013120963471010327, "learning_rate": 6.222057579396849e-07, "loss": 0.00013142451643943787, "reward": 0.4312500059604645, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 511.9375, "completions/min_length": 409.0, "epoch": 9.586764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.2879153490066528, "kl": 0.014042856404557824, "learning_rate": 6.220813130181407e-07, "loss": 0.00013918429613113403, "reward": 0.550000011920929, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 485.375, "completions/min_length": 435.0, "epoch": 9.588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9994187355041504, "kl": 0.009998208959586918, "learning_rate": 6.219568600541885e-07, "loss": 0.00010029971599578857, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 470.9375, "completions/min_length": 413.0, "epoch": 9.589705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02176346816122532, "kl": 0.010166593361645937, "learning_rate": 6.218323990560271e-07, "loss": 0.00010182542609982193, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 583.0625, "completions/min_length": 526.0, "epoch": 9.591176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3220220804214478, "kl": 0.009338213130831718, "learning_rate": 6.217079300318555e-07, "loss": 9.329244494438171e-05, "reward": 0.8545833826065063, "reward_std": 0.190956249833107, "rewards/DrugCombAccuracyCOTORM/mean": 0.8260416984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.24114784598350525, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 6522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 472.1875, "completions/min_length": 401.0, "epoch": 9.592647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.011883108876645565, "kl": 0.009142827708274126, "learning_rate": 6.215834529898736e-07, "loss": 9.121953189605847e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 460.375, "completions/min_length": 424.0, "epoch": 9.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013750984333455563, "kl": 0.009186987648718059, "learning_rate": 6.214589679382815e-07, "loss": 9.11758397705853e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 490.75, "completions/min_length": 411.0, "epoch": 9.595588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8364404439926147, "kl": 0.011764808325096965, "learning_rate": 6.213344748852798e-07, "loss": 0.00011691441613947973, "reward": 0.45891666412353516, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 1.0144785642623901, "step": 6525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 485.4375, "completions/min_length": 425.0, "epoch": 9.597058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7695293426513672, "kl": 0.010101985651999712, "learning_rate": 6.2120997383907e-07, "loss": 0.00010145684791496024, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 6526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 462.5625, "completions/min_length": 420.0, "epoch": 9.598529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0229939222335815, "kl": 0.009130755322985351, "learning_rate": 6.210854648078539e-07, "loss": 9.13673939066939e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 420.4375, "completions/min_length": 342.0, "epoch": 9.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.02813154086470604, "kl": 0.010176341631449759, "learning_rate": 6.209609477998338e-07, "loss": 0.00010093134187627584, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 486.0, "completions/min_length": 432.0, "epoch": 9.601470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009892306290566921, "kl": 0.008480425225570798, "learning_rate": 6.208364228232127e-07, "loss": 8.47746996441856e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 475.375, "completions/min_length": 412.0, "epoch": 9.602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01611618883907795, "kl": 0.008987550158053637, "learning_rate": 6.207118898861937e-07, "loss": 9.041930024977773e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 480.5625, "completions/min_length": 416.0, "epoch": 9.604411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5033693313598633, "kl": 0.013161416864022613, "learning_rate": 6.20587348996981e-07, "loss": 0.00013052672147750854, "reward": 0.668749988079071, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 6531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 467.8125, "completions/min_length": 414.0, "epoch": 9.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.019256725907325745, "kl": 0.013071767869405448, "learning_rate": 6.204628001637787e-07, "loss": 0.00012972029799129814, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 398.1875, "completions/min_length": 368.0, "epoch": 9.60735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009985141456127167, "kl": 0.009196387138217688, "learning_rate": 6.203382433947921e-07, "loss": 9.20411548577249e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 473.625, "completions/min_length": 413.0, "epoch": 9.608823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9102672338485718, "kl": 0.011534973746165633, "learning_rate": 6.202136786982266e-07, "loss": 0.00011510401964187622, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 478.375, "completions/min_length": 400.0, "epoch": 9.610294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8733599781990051, "kl": 0.011454251362010837, "learning_rate": 6.200891060822883e-07, "loss": 0.00011410054139560089, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 450.8125, "completions/min_length": 382.0, "epoch": 9.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.021946363151073456, "kl": 0.009794845478609204, "learning_rate": 6.199645255551835e-07, "loss": 9.748540469445288e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 460.75, "completions/min_length": 398.0, "epoch": 9.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013397803530097008, "kl": 0.008816557237878442, "learning_rate": 6.198399371251192e-07, "loss": 8.872344915289432e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 435.75, "completions/min_length": 392.0, "epoch": 9.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.02145429700613022, "kl": 0.009189594304189086, "learning_rate": 6.197153408003032e-07, "loss": 9.118887828662992e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 501.875, "completions/min_length": 394.0, "epoch": 9.616176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1091994047164917, "kl": 0.011676798923872411, "learning_rate": 6.195907365889435e-07, "loss": 0.00011756084859371185, "reward": 0.26133447885513306, "reward_std": 0.16009686887264252, "rewards/DrugCombAccuracyCOTORM/mean": 0.1904701590538025, "rewards/DrugCombAccuracyCOTORM/std": 0.304080605506897, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.08958332985639572, "rewards/DrugCombCoverageCOTORM/std": 0.5366864800453186, "step": 6539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 454.0, "completions/min_length": 397.0, "epoch": 9.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.024646125733852386, "kl": 0.01037340343464166, "learning_rate": 6.194661244992487e-07, "loss": 0.00010249041952192783, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 453.3125, "completions/min_length": 397.0, "epoch": 9.619117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0334341526031494, "kl": 0.01076393062248826, "learning_rate": 6.193415045394281e-07, "loss": 0.00010759418364614248, "reward": 0.737500011920929, "reward_std": 0.219983771443367, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 467.375, "completions/min_length": 427.0, "epoch": 9.620588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.018351197242736816, "kl": 0.010825144359841943, "learning_rate": 6.192168767176912e-07, "loss": 0.0001083496244973503, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 477.625, "completions/min_length": 337.0, "epoch": 9.62205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.3122464418411255, "kl": 0.010150477522984147, "learning_rate": 6.19092241042248e-07, "loss": 0.00010326040501240641, "reward": 0.7296291589736938, "reward_std": 0.20905083417892456, "rewards/DrugCombAccuracyCOTORM/mean": 0.7082604169845581, "rewards/DrugCombAccuracyCOTORM/std": 0.4210386574268341, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6302083134651184, "rewards/DrugCombCoverageCOTORM/std": 0.7064312696456909, "step": 6543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 467.0, "completions/min_length": 389.0, "epoch": 9.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703604221343994, "kl": 0.012569868238642812, "learning_rate": 6.189675975213093e-07, "loss": 0.00012649595737457275, "reward": 0.45883333683013916, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.36000001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.39273402094841003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4849589467048645, "step": 6544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 501.375, "completions/min_length": 449.0, "epoch": 9.625, "frac_reward_zero_std": 0.5, "grad_norm": 1.0614839792251587, "kl": 0.00873990380205214, "learning_rate": 6.188429461630865e-07, "loss": 8.710473775863647e-05, "reward": 0.7121636867523193, "reward_std": 0.07258789241313934, "rewards/DrugCombAccuracyCOTORM/mean": 0.6538764834403992, "rewards/DrugCombAccuracyCOTORM/std": 0.3759545683860779, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.1280868947505951, "step": 6545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 447.5625, "completions/min_length": 375.0, "epoch": 9.626470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.59697425365448, "kl": 0.028652939829044044, "learning_rate": 6.187182869757911e-07, "loss": 0.000292164389975369, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 427.0625, "completions/min_length": 362.0, "epoch": 9.62794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03012958914041519, "kl": 0.011850080802105367, "learning_rate": 6.185936199676354e-07, "loss": 0.00011914959759451449, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 510.1875, "completions/min_length": 453.0, "epoch": 9.629411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.089394450187683, "kl": 0.01364411087706685, "learning_rate": 6.184689451468322e-07, "loss": 0.0001340347371296957, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 6548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 504.5, "completions/min_length": 428.0, "epoch": 9.630882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2479668855667114, "kl": 0.011275999946519732, "learning_rate": 6.183442625215946e-07, "loss": 0.00011288374662399292, "reward": 0.6758333444595337, "reward_std": 0.21416354179382324, "rewards/DrugCombAccuracyCOTORM/mean": 0.6312500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3196916878223419, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.21516574919223785, "step": 6549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 483.0625, "completions/min_length": 428.0, "epoch": 9.632352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.1978834867477417, "kl": 0.011280197999440134, "learning_rate": 6.182195721001366e-07, "loss": 0.00011215731501579285, "reward": 0.5142499804496765, "reward_std": 0.2300947606563568, "rewards/DrugCombAccuracyCOTORM/mean": 0.47874999046325684, "rewards/DrugCombAccuracyCOTORM/std": 0.48152363300323486, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9227073788642883, "step": 6550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 450.25, "completions/min_length": 345.0, "epoch": 9.633823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8193404078483582, "kl": 0.010243709664791822, "learning_rate": 6.180948738906723e-07, "loss": 0.0001023771837935783, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 492.25, "completions/min_length": 426.0, "epoch": 9.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01649365946650505, "kl": 0.00911984487902373, "learning_rate": 6.179701679014164e-07, "loss": 9.125575888901949e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 472.4375, "completions/min_length": 377.0, "epoch": 9.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010646624490618706, "kl": 0.007996827480383217, "learning_rate": 6.178454541405848e-07, "loss": 7.984443072928116e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 520.125, "completions/min_length": 411.0, "epoch": 9.638235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.409842610359192, "kl": 0.012425424298271537, "learning_rate": 6.177207326163926e-07, "loss": 0.0001253262162208557, "reward": 0.5520833134651184, "reward_std": 0.3809265196323395, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5639641284942627, "step": 6554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 458.125, "completions/min_length": 392.0, "epoch": 9.639705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1165883541107178, "kl": 0.012112160678952932, "learning_rate": 6.175960033370565e-07, "loss": 0.00012038582644890994, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/mean_length": 533.0, "completions/min_length": 393.0, "epoch": 9.641176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.013082587160170078, "kl": 0.00996893085539341, "learning_rate": 6.174712663107933e-07, "loss": 9.977446461562067e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 414.9375, "completions/min_length": 367.0, "epoch": 9.64264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.024577302858233452, "kl": 0.010538324480876327, "learning_rate": 6.173465215458202e-07, "loss": 0.0001038838890963234, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 468.8125, "completions/min_length": 385.0, "epoch": 9.644117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7659632563591003, "kl": 0.016646051313728094, "learning_rate": 6.172217690503553e-07, "loss": 0.0001670922210905701, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 445.875, "completions/min_length": 419.0, "epoch": 9.645588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6633307933807373, "kl": 0.014138910453766584, "learning_rate": 6.170970088326173e-07, "loss": 0.000141877681016922, "reward": 0.625, "reward_std": 0.45729556679725647, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 431.9375, "completions/min_length": 379.0, "epoch": 9.647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.6982199549674988, "kl": 0.009696298278868198, "learning_rate": 6.169722409008244e-07, "loss": 9.74023641902022e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 461.375, "completions/min_length": 381.0, "epoch": 9.648529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 2.8856594562530518, "kl": 0.016365298070013523, "learning_rate": 6.168474652631962e-07, "loss": 0.00016354769468307495, "reward": 0.8142499923706055, "reward_std": 0.3468778729438782, "rewards/DrugCombAccuracyCOTORM/mean": 0.7912499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.3766497075557709, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 6561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 415.5625, "completions/min_length": 384.0, "epoch": 9.65, "frac_reward_zero_std": 1.0, "grad_norm": 0.009524315595626831, "kl": 0.007136130356229842, "learning_rate": 6.167226819279527e-07, "loss": 7.122494571376592e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 443.6875, "completions/min_length": 346.0, "epoch": 9.651470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017074212431907654, "kl": 0.009637123555876315, "learning_rate": 6.165978909033144e-07, "loss": 9.545606735628098e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 447.1875, "completions/min_length": 412.0, "epoch": 9.652941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016866644844412804, "kl": 0.00966876873280853, "learning_rate": 6.16473092197502e-07, "loss": 9.62655758485198e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 509.875, "completions/min_length": 438.0, "epoch": 9.654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9957692623138428, "kl": 0.010662692598998547, "learning_rate": 6.163482858187372e-07, "loss": 0.00010681897401809692, "reward": 0.8305134773254395, "reward_std": 0.11839533597230911, "rewards/DrugCombAccuracyCOTORM/mean": 0.8143137097358704, "rewards/DrugCombAccuracyCOTORM/std": 0.25862404704093933, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8062499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.26196375489234924, "step": 6565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 465.25, "completions/min_length": 428.0, "epoch": 9.655882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9123345613479614, "kl": 0.010611013043671846, "learning_rate": 6.162234717752417e-07, "loss": 0.00010597705841064453, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 6566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 437.8125, "completions/min_length": 404.0, "epoch": 9.657352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.016665054485201836, "kl": 0.009052134933881462, "learning_rate": 6.160986500752381e-07, "loss": 9.040314034791663e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 457.9375, "completions/min_length": 403.0, "epoch": 9.658823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8846916556358337, "kl": 0.013223169138655066, "learning_rate": 6.15973820726949e-07, "loss": 0.00013149157166481018, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 389.5, "completions/min_length": 313.0, "epoch": 9.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011739861220121384, "kl": 0.00911269977223128, "learning_rate": 6.158489837385982e-07, "loss": 9.095160203287378e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 462.875, "completions/min_length": 429.0, "epoch": 9.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.877048909664154, "kl": 0.010478603071533144, "learning_rate": 6.157241391184096e-07, "loss": 0.000104543287307024, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 440.75, "completions/min_length": 376.0, "epoch": 9.663235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.2548846006393433, "kl": 0.009779191575944424, "learning_rate": 6.155992868746077e-07, "loss": 9.719282388687134e-05, "reward": 0.7000000476837158, "reward_std": 0.41403937339782715, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 453.9375, "completions/min_length": 382.0, "epoch": 9.66470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8623801469802856, "kl": 0.009571373811922967, "learning_rate": 6.154744270154171e-07, "loss": 9.577721357345581e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 6572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 458.3125, "completions/min_length": 405.0, "epoch": 9.666176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.011732282117009163, "kl": 0.008004212286323309, "learning_rate": 6.153495595490635e-07, "loss": 7.999704394023865e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 454.5, "completions/min_length": 417.0, "epoch": 9.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010355056263506413, "kl": 0.008192275185137987, "learning_rate": 6.152246844837729e-07, "loss": 8.136210817610845e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 413.3125, "completions/min_length": 355.0, "epoch": 9.669117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9521611928939819, "kl": 0.008530697785317898, "learning_rate": 6.150998018277717e-07, "loss": 8.550361235393211e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 466.375, "completions/min_length": 383.0, "epoch": 9.670588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7889332175254822, "kl": 0.009605441824533045, "learning_rate": 6.149749115892868e-07, "loss": 9.480863809585571e-05, "reward": 0.7589166760444641, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.28052034974098206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 479.1875, "completions/min_length": 409.0, "epoch": 9.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.210060477256775, "kl": 0.014400786953046918, "learning_rate": 6.148500137765457e-07, "loss": 0.00014373693556990474, "reward": 0.6875, "reward_std": 0.1941096931695938, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 430.0, "completions/min_length": 315.0, "epoch": 9.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.877034068107605, "kl": 0.011153403203934431, "learning_rate": 6.147251083977762e-07, "loss": 0.00011077741510234773, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 470.8125, "completions/min_length": 429.0, "epoch": 9.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.012249508872628212, "kl": 0.007631979533471167, "learning_rate": 6.146001954612071e-07, "loss": 7.626684237038717e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 472.9375, "completions/min_length": 422.0, "epoch": 9.676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.890042245388031, "kl": 0.011491924989968538, "learning_rate": 6.14475274975067e-07, "loss": 0.00011529773473739624, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 444.6875, "completions/min_length": 393.0, "epoch": 9.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.010614304803311825, "kl": 0.008907780051231384, "learning_rate": 6.143503469475856e-07, "loss": 8.895697828847915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 464.875, "completions/min_length": 431.0, "epoch": 9.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9059984087944031, "kl": 0.012507970910519361, "learning_rate": 6.142254113869927e-07, "loss": 0.00012422165309544653, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 418.375, "completions/min_length": 386.0, "epoch": 9.680882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02634263038635254, "kl": 0.009362895507365465, "learning_rate": 6.141004683015187e-07, "loss": 9.432980004930869e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 484.375, "completions/min_length": 412.0, "epoch": 9.68235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.022069590166211128, "kl": 0.010037706000730395, "learning_rate": 6.139755176993947e-07, "loss": 9.989055979531258e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 452.8125, "completions/min_length": 393.0, "epoch": 9.683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01712379977107048, "kl": 0.008777474984526634, "learning_rate": 6.138505595888519e-07, "loss": 8.759608317632228e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 430.875, "completions/min_length": 368.0, "epoch": 9.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020028136670589447, "kl": 0.00964879128150642, "learning_rate": 6.137255939781224e-07, "loss": 9.661228978075087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 439.5, "completions/min_length": 371.0, "epoch": 9.686764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.009627144783735275, "kl": 0.00939849333371967, "learning_rate": 6.136006208754386e-07, "loss": 9.332512127002701e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 448.8125, "completions/min_length": 389.0, "epoch": 9.688235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9876254200935364, "kl": 0.01222915283869952, "learning_rate": 6.134756402890334e-07, "loss": 0.00012067623174516484, "reward": 0.559374988079071, "reward_std": 0.04568428173661232, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 6588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 462.0625, "completions/min_length": 409.0, "epoch": 9.689705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012281287461519241, "kl": 0.00975548115093261, "learning_rate": 6.133506522271402e-07, "loss": 9.755232167663053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 485.125, "completions/min_length": 423.0, "epoch": 9.691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8611192107200623, "kl": 0.008294941508211195, "learning_rate": 6.132256566979929e-07, "loss": 8.264929056167603e-05, "reward": 0.8034636974334717, "reward_std": 0.153117373585701, "rewards/DrugCombAccuracyCOTORM/mean": 0.769303560256958, "rewards/DrugCombAccuracyCOTORM/std": 0.338876336812973, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8802083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.23564086854457855, "step": 6590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 507.1875, "completions/min_length": 350.0, "epoch": 9.69264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8094226717948914, "kl": 0.011707593919709325, "learning_rate": 6.131006537098258e-07, "loss": 0.00012047216296195984, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 472.75, "completions/min_length": 417.0, "epoch": 9.694117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.012449385598301888, "kl": 0.009982003131881356, "learning_rate": 6.129756432708738e-07, "loss": 0.00010116874182131141, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 483.375, "completions/min_length": 413.0, "epoch": 9.695588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7906362414360046, "kl": 0.007689194055274129, "learning_rate": 6.128506253893724e-07, "loss": 7.684329466428608e-05, "reward": 0.8958333134651184, "reward_std": 0.11347680538892746, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 6593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 444.875, "completions/min_length": 399.0, "epoch": 9.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0885330438613892, "kl": 0.00989695021416992, "learning_rate": 6.127256000735576e-07, "loss": 9.898096323013306e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 488.25, "completions/min_length": 400.0, "epoch": 9.698529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9356943964958191, "kl": 0.010386705165728927, "learning_rate": 6.126005673316652e-07, "loss": 0.0001037493348121643, "reward": 0.590541660785675, "reward_std": 0.02312419004738331, "rewards/DrugCombAccuracyCOTORM/mean": 0.5103124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5073147416114807, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 6595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 450.0, "completions/min_length": 376.0, "epoch": 9.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.0317232683300972, "kl": 0.010903707472607493, "learning_rate": 6.124755271719326e-07, "loss": 0.00010892070713452995, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 465.0, "completions/min_length": 417.0, "epoch": 9.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01954645663499832, "kl": 0.009598368080332875, "learning_rate": 6.123504796025968e-07, "loss": 9.491266973782331e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 495.5625, "completions/min_length": 386.0, "epoch": 9.702941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7817455530166626, "kl": 0.007909078150987625, "learning_rate": 6.122254246318956e-07, "loss": 7.985484262462705e-05, "reward": 0.5351666808128357, "reward_std": 0.10229478776454926, "rewards/DrugCombAccuracyCOTORM/mean": 0.4449999928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.27909377217292786, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2063797265291214, "step": 6598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 433.75, "completions/min_length": 374.0, "epoch": 9.704411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.02059338055551052, "kl": 0.01220237696543336, "learning_rate": 6.121003622680677e-07, "loss": 0.00012221175711601973, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 6599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 447.1875, "completions/min_length": 373.0, "epoch": 9.705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012671560049057007, "kl": 0.011741833295673132, "learning_rate": 6.119752925193516e-07, "loss": 0.00011741770140361041, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 437.3125, "completions/min_length": 348.0, "epoch": 9.70735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009155934676527977, "kl": 0.007652883883565664, "learning_rate": 6.118502153939865e-07, "loss": 7.646858284715563e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 484.8125, "completions/min_length": 439.0, "epoch": 9.708823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01558193564414978, "kl": 0.012145700631663203, "learning_rate": 6.117251309002123e-07, "loss": 0.00012097114085918292, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 6602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 459.8125, "completions/min_length": 392.0, "epoch": 9.71029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9844799637794495, "kl": 0.009599861921742558, "learning_rate": 6.116000390462691e-07, "loss": 9.558031888445839e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 481.125, "completions/min_length": 433.0, "epoch": 9.711764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.674950122833252, "kl": 0.009685783879831433, "learning_rate": 6.114749398403979e-07, "loss": 9.647011756896973e-05, "reward": 0.7195416688919067, "reward_std": 0.2740638554096222, "rewards/DrugCombAccuracyCOTORM/mean": 0.6741666793823242, "rewards/DrugCombAccuracyCOTORM/std": 0.36968156695365906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2916666865348816, "step": 6604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 484.1875, "completions/min_length": 418.0, "epoch": 9.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014091783203184605, "kl": 0.010324024013243616, "learning_rate": 6.113498332908397e-07, "loss": 0.00010326842311769724, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 423.5625, "completions/min_length": 387.0, "epoch": 9.714705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1098952293395996, "kl": 0.011917467578314245, "learning_rate": 6.112247194058364e-07, "loss": 0.00011885710409842432, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 449.125, "completions/min_length": 403.0, "epoch": 9.716176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.020936908200383186, "kl": 0.009788450668565929, "learning_rate": 6.110995981936299e-07, "loss": 9.813466749619693e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 450.375, "completions/min_length": 405.0, "epoch": 9.717647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.04240815341472626, "kl": 0.009367990191094577, "learning_rate": 6.10974469662463e-07, "loss": 9.34591080294922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 440.5, "completions/min_length": 371.0, "epoch": 9.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9543272256851196, "kl": 0.011322895647026598, "learning_rate": 6.10849333820579e-07, "loss": 0.00011280587932560593, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 441.125, "completions/min_length": 386.0, "epoch": 9.720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.024528371170163155, "kl": 0.011911514913663268, "learning_rate": 6.107241906762214e-07, "loss": 0.00011963867291342467, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 450.4375, "completions/min_length": 376.0, "epoch": 9.722058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01207113079726696, "kl": 0.008681889739818871, "learning_rate": 6.105990402376343e-07, "loss": 8.73283643159084e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 430.4375, "completions/min_length": 366.0, "epoch": 9.723529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9701960682868958, "kl": 0.009518901817500591, "learning_rate": 6.104738825130624e-07, "loss": 9.553134441375732e-05, "reward": 0.893750011920929, "reward_std": 0.16569657623767853, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.28867512941360474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 462.8125, "completions/min_length": 409.0, "epoch": 9.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.9226978421211243, "kl": 0.007934154709801078, "learning_rate": 6.103487175107507e-07, "loss": 8.00356428953819e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 431.0, "completions/min_length": 372.0, "epoch": 9.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.029977522790431976, "kl": 0.010995657881721854, "learning_rate": 6.102235452389446e-07, "loss": 0.00011005339911207557, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 540.6875, "completions/min_length": 440.0, "epoch": 9.727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9033161997795105, "kl": 0.009462183341383934, "learning_rate": 6.100983657058904e-07, "loss": 9.518861770629883e-05, "reward": 0.8802083730697632, "reward_std": 0.10019201785326004, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.22669117152690887, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 423.625, "completions/min_length": 382.0, "epoch": 9.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.019321754574775696, "kl": 0.009653004352003336, "learning_rate": 6.099731789198344e-07, "loss": 9.689714352134615e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 453.0, "completions/min_length": 375.0, "epoch": 9.730882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.017514705657958984, "kl": 0.008299051085487008, "learning_rate": 6.098479848890237e-07, "loss": 8.275738218799233e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 465.25, "completions/min_length": 378.0, "epoch": 9.73235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.6610836982727051, "kl": 0.01048503560014069, "learning_rate": 6.097227836217058e-07, "loss": 0.0001040041825035587, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 438.125, "completions/min_length": 401.0, "epoch": 9.733823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012449858710169792, "kl": 0.008011088939383626, "learning_rate": 6.095975751261285e-07, "loss": 8.020684617804363e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 423.4375, "completions/min_length": 335.0, "epoch": 9.735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0999071598052979, "kl": 0.014265678124502301, "learning_rate": 6.094723594105403e-07, "loss": 0.0001398622989654541, "reward": 0.5428333282470703, "reward_std": 0.07931191474199295, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2083333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.9727776646614075, "step": 6620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 498.3125, "completions/min_length": 440.0, "epoch": 9.736764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9964133501052856, "kl": 0.01007781527005136, "learning_rate": 6.093471364831902e-07, "loss": 0.00010197237133979797, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 508.4375, "completions/min_length": 399.0, "epoch": 9.738235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008556398563086987, "kl": 0.00713377189822495, "learning_rate": 6.092219063523274e-07, "loss": 7.11431130184792e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 425.5, "completions/min_length": 372.0, "epoch": 9.739705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.027788907289505005, "kl": 0.010423369705677032, "learning_rate": 6.090966690262019e-07, "loss": 0.00010409406968392432, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/mean_length": 607.75, "completions/min_length": 535.0, "epoch": 9.741176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1967166662216187, "kl": 0.008144958643242717, "learning_rate": 6.089714245130639e-07, "loss": 8.160248398780823e-05, "reward": 0.7037271857261658, "reward_std": 0.17752264440059662, "rewards/DrugCombAccuracyCOTORM/mean": 0.6460652351379395, "rewards/DrugCombAccuracyCOTORM/std": 0.3319319486618042, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8687499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.2548692524433136, "step": 6624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 512.5625, "completions/min_length": 468.0, "epoch": 9.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9529567956924438, "kl": 0.012025111704133451, "learning_rate": 6.088461728211641e-07, "loss": 0.00012011826038360596, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 479.4375, "completions/min_length": 433.0, "epoch": 9.744117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7757231593132019, "kl": 0.009428300196304917, "learning_rate": 6.08720913958754e-07, "loss": 9.382650750922039e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 511.8125, "completions/min_length": 443.0, "epoch": 9.745588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3076695203781128, "kl": 0.012524057412520051, "learning_rate": 6.085956479340852e-07, "loss": 0.00012533366680145264, "reward": 0.7593749761581421, "reward_std": 0.3784170150756836, "rewards/DrugCombAccuracyCOTORM/mean": 0.71875, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 6627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 465.125, "completions/min_length": 377.0, "epoch": 9.74705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0107211172580719, "kl": 0.0077586001716554165, "learning_rate": 6.084703747554101e-07, "loss": 7.790526433382183e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 473.8125, "completions/min_length": 438.0, "epoch": 9.748529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0554736852645874, "kl": 0.009730233927257359, "learning_rate": 6.083450944309811e-07, "loss": 9.824381413636729e-05, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 447.8125, "completions/min_length": 404.0, "epoch": 9.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.95964115858078, "kl": 0.011959767201915383, "learning_rate": 6.082198069690514e-07, "loss": 0.00011956319212913513, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 428.125, "completions/min_length": 384.0, "epoch": 9.751470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0172220878303051, "kl": 0.009345055907033384, "learning_rate": 6.080945123778748e-07, "loss": 9.351566404802725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 536.8125, "completions/min_length": 436.0, "epoch": 9.75294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.495316982269287, "kl": 0.012244317913427949, "learning_rate": 6.079692106657052e-07, "loss": 0.00012101978063583374, "reward": 0.8375000357627869, "reward_std": 0.2384980022907257, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 490.4375, "completions/min_length": 427.0, "epoch": 9.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8791830539703369, "kl": 0.010225481004454195, "learning_rate": 6.078439018407971e-07, "loss": 0.00010241257405141369, "reward": 0.675000011920929, "reward_std": 0.20528726279735565, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 447.9375, "completions/min_length": 401.0, "epoch": 9.755882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010893573053181171, "kl": 0.008270775550045073, "learning_rate": 6.077185859114059e-07, "loss": 8.253469422925264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 422.5, "completions/min_length": 366.0, "epoch": 9.757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.014016740955412388, "kl": 0.008101481129415333, "learning_rate": 6.075932628857869e-07, "loss": 8.048395102377981e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 462.0625, "completions/min_length": 382.0, "epoch": 9.758823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8713764548301697, "kl": 0.010711544658988714, "learning_rate": 6.074679327721958e-07, "loss": 0.00010710809146985412, "reward": 0.7562500238418579, "reward_std": 0.2610931694507599, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 6636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 497.9375, "completions/min_length": 365.0, "epoch": 9.760294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.4785977602005005, "kl": 0.027195788105018437, "learning_rate": 6.073425955788893e-07, "loss": 0.0002601617306936532, "reward": 0.6962291598320007, "reward_std": 0.1498546004295349, "rewards/DrugCombAccuracyCOTORM/mean": 0.6619530916213989, "rewards/DrugCombAccuracyCOTORM/std": 0.417027086019516, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.523520827293396, "step": 6637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 401.6875, "completions/min_length": 353.0, "epoch": 9.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009160604327917099, "kl": 0.007696392014622688, "learning_rate": 6.072172513141244e-07, "loss": 7.733084203209728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 436.875, "completions/min_length": 372.0, "epoch": 9.763235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012532178312540054, "kl": 0.009004556224681437, "learning_rate": 6.070918999861581e-07, "loss": 8.980065467767417e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 479.0, "completions/min_length": 395.0, "epoch": 9.764705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.83880615234375, "kl": 0.01305719988886267, "learning_rate": 6.069665416032486e-07, "loss": 0.00013136863708496094, "reward": 0.6923666596412659, "reward_std": 0.12124417722225189, "rewards/DrugCombAccuracyCOTORM/mean": 0.6274374723434448, "rewards/DrugCombAccuracyCOTORM/std": 0.4367406666278839, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9041666388511658, "rewards/DrugCombCoverageCOTORM/std": 0.11013460159301758, "step": 6640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 457.0, "completions/min_length": 394.0, "epoch": 9.766176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01631022058427334, "kl": 0.007227154448628426, "learning_rate": 6.068411761736542e-07, "loss": 7.255468517541885e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 520.9375, "completions/min_length": 458.0, "epoch": 9.76764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009240580722689629, "kl": 0.008213969296775758, "learning_rate": 6.067158037056333e-07, "loss": 8.210909436456859e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 454.9375, "completions/min_length": 391.0, "epoch": 9.769117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8545779585838318, "kl": 0.010280261980369687, "learning_rate": 6.065904242074452e-07, "loss": 0.00010258331894874573, "reward": 0.7151666879653931, "reward_std": 0.2424522191286087, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 6643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 494.8125, "completions/min_length": 381.0, "epoch": 9.770588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00772574869915843, "kl": 0.005830965819768608, "learning_rate": 6.064650376873498e-07, "loss": 5.8015662943944335e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 428.4375, "completions/min_length": 339.0, "epoch": 9.772058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.8429149389266968, "kl": 0.01856259210035205, "learning_rate": 6.063396441536071e-07, "loss": 0.0001862645149230957, "reward": 0.7945833206176758, "reward_std": 0.3222442865371704, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 422.6875, "completions/min_length": 386.0, "epoch": 9.773529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.12678861618042, "kl": 0.016056942637078464, "learning_rate": 6.062142436144779e-07, "loss": 0.00016257278912235051, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 507.1875, "completions/min_length": 433.0, "epoch": 9.775, "frac_reward_zero_std": 0.0, "grad_norm": 1.2560486793518066, "kl": 0.014533650130033493, "learning_rate": 6.060888360782231e-07, "loss": 0.00014643371105194092, "reward": 0.8350000381469727, "reward_std": 0.24039798974990845, "rewards/DrugCombAccuracyCOTORM/mean": 0.8041666746139526, "rewards/DrugCombAccuracyCOTORM/std": 0.3324154019355774, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 6647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 512.875, "completions/min_length": 433.0, "epoch": 9.776470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.802487850189209, "kl": 0.01854186225682497, "learning_rate": 6.059634215531042e-07, "loss": 0.00018885917961597443, "reward": 0.9562301635742188, "reward_std": 0.02713252790272236, "rewards/DrugCombAccuracyCOTORM/mean": 0.9484127163887024, "rewards/DrugCombAccuracyCOTORM/std": 0.06918887048959732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10000000149011612, "step": 6648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 452.0625, "completions/min_length": 403.0, "epoch": 9.777941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9729143381118774, "kl": 0.012346544302999973, "learning_rate": 6.058380000473833e-07, "loss": 0.00012265713303349912, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/mean_length": 559.0, "completions/min_length": 435.0, "epoch": 9.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8917107582092285, "kl": 0.009726068703457713, "learning_rate": 6.057125715693228e-07, "loss": 9.805150330066681e-05, "reward": 0.7880291938781738, "reward_std": 0.1429351270198822, "rewards/DrugCombAccuracyCOTORM/mean": 0.7411562204360962, "rewards/DrugCombAccuracyCOTORM/std": 0.3588314354419708, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9510416388511658, "rewards/DrugCombCoverageCOTORM/std": 0.07828456908464432, "step": 6650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 393.625, "completions/min_length": 336.0, "epoch": 9.780882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01835736259818077, "kl": 0.00940421933773905, "learning_rate": 6.055871361271855e-07, "loss": 9.431327634956688e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 514.0625, "completions/min_length": 452.0, "epoch": 9.782352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0942742824554443, "kl": 0.012297590030357242, "learning_rate": 6.054616937292349e-07, "loss": 0.00012417417019605637, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 454.5625, "completions/min_length": 387.0, "epoch": 9.783823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.043741051107645035, "kl": 0.011427167570218444, "learning_rate": 6.053362443837349e-07, "loss": 0.00011394838656997308, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 456.75, "completions/min_length": 415.0, "epoch": 9.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0566446781158447, "kl": 0.010726056760177016, "learning_rate": 6.052107880989497e-07, "loss": 0.00010763108730316162, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 6654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 425.0625, "completions/min_length": 379.0, "epoch": 9.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0853893756866455, "kl": 0.01953017502091825, "learning_rate": 6.050853248831439e-07, "loss": 0.0001846708619268611, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 445.5625, "completions/min_length": 393.0, "epoch": 9.788235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.013637196272611618, "kl": 0.010289471596479416, "learning_rate": 6.049598547445829e-07, "loss": 0.00010265929449815303, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 442.875, "completions/min_length": 392.0, "epoch": 9.78970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03614232316613197, "kl": 0.011806295020505786, "learning_rate": 6.048343776915323e-07, "loss": 0.00011824421380879357, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 483.0, "completions/min_length": 347.0, "epoch": 9.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.5190110206604004, "kl": 0.012029603007249534, "learning_rate": 6.047088937322582e-07, "loss": 0.00012021884322166443, "reward": 0.6189791560173035, "reward_std": 0.09366151690483093, "rewards/DrugCombAccuracyCOTORM/mean": 0.5679947733879089, "rewards/DrugCombAccuracyCOTORM/std": 0.46993428468704224, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 494.75, "completions/min_length": 435.0, "epoch": 9.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.046438217163086, "kl": 0.014457492274232209, "learning_rate": 6.045834028750273e-07, "loss": 0.00014292112609837204, "reward": 0.8999999761581421, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 418.1875, "completions/min_length": 374.0, "epoch": 9.794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.05386710539460182, "kl": 0.011748184682801366, "learning_rate": 6.044579051281062e-07, "loss": 0.00011998417176073417, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 438.8125, "completions/min_length": 393.0, "epoch": 9.795588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458013191819191, "kl": 0.014343652175739408, "learning_rate": 6.043324004997629e-07, "loss": 0.0001453342556487769, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 439.5625, "completions/min_length": 388.0, "epoch": 9.797058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01993970200419426, "kl": 0.010978335165418684, "learning_rate": 6.04206888998265e-07, "loss": 0.00010961142834275961, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 534.8125, "completions/min_length": 396.0, "epoch": 9.798529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4807392358779907, "kl": 0.01326122647151351, "learning_rate": 6.040813706318809e-07, "loss": 0.00013542920351028442, "reward": 0.6421874761581421, "reward_std": 0.49385982751846313, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 6663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 480.1875, "completions/min_length": 432.0, "epoch": 9.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.015657927840948105, "kl": 0.010216036811470985, "learning_rate": 6.039558454088795e-07, "loss": 0.00010256657697027549, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 6664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 463.8125, "completions/min_length": 350.0, "epoch": 9.801470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9432827830314636, "kl": 0.009080032701604068, "learning_rate": 6.038303133375303e-07, "loss": 9.179487824440002e-05, "reward": 0.9302083253860474, "reward_std": 0.144470676779747, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 439.875, "completions/min_length": 377.0, "epoch": 9.802941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.017900027334690094, "kl": 0.00934730307199061, "learning_rate": 6.037047744261027e-07, "loss": 9.300503734266385e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 485.625, "completions/min_length": 415.0, "epoch": 9.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01112943422049284, "kl": 0.01511152065359056, "learning_rate": 6.035792286828669e-07, "loss": 0.00015084528422448784, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 489.375, "completions/min_length": 419.0, "epoch": 9.805882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3384474515914917, "kl": 0.013550907140597701, "learning_rate": 6.034536761160938e-07, "loss": 0.00013640522956848145, "reward": 0.737500011920929, "reward_std": 0.3709394931793213, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 459.875, "completions/min_length": 401.0, "epoch": 9.80735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9483063220977783, "kl": 0.013926741434261203, "learning_rate": 6.033281167340541e-07, "loss": 0.0001390613615512848, "reward": 0.4312500059604645, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 436.375, "completions/min_length": 395.0, "epoch": 9.808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8705754280090332, "kl": 0.00853624171577394, "learning_rate": 6.032025505450198e-07, "loss": 8.528679609298706e-05, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 6670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 406.125, "completions/min_length": 354.0, "epoch": 9.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012689128518104553, "kl": 0.00786565092857927, "learning_rate": 6.030769775572626e-07, "loss": 7.893570000305772e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 529.1875, "completions/min_length": 435.0, "epoch": 9.811764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.4657868146896362, "kl": 0.02548368158750236, "learning_rate": 6.029513977790549e-07, "loss": 0.0002535879611968994, "reward": 0.43791958689689636, "reward_std": 0.2856994569301605, "rewards/DrugCombAccuracyCOTORM/mean": 0.36146196722984314, "rewards/DrugCombAccuracyCOTORM/std": 0.44979187846183777, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.48750001192092896, "rewards/DrugCombCoverageCOTORM/std": 0.5340099930763245, "step": 6672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 474.9375, "completions/min_length": 392.0, "epoch": 9.813235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.0810574293136597, "kl": 0.00906012812629342, "learning_rate": 6.028258112186697e-07, "loss": 9.042024612426758e-05, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 479.0, "completions/min_length": 384.0, "epoch": 9.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9214267134666443, "kl": 0.015691568376496434, "learning_rate": 6.027002178843802e-07, "loss": 0.00015625357627868652, "reward": 0.2699500024318695, "reward_std": 0.1333603411912918, "rewards/DrugCombAccuracyCOTORM/mean": 0.12024999409914017, "rewards/DrugCombAccuracyCOTORM/std": 0.24366740882396698, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.737500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.30740854144096375, "step": 6674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 458.875, "completions/min_length": 405.0, "epoch": 9.816176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7624263763427734, "kl": 0.01137719419784844, "learning_rate": 6.025746177844603e-07, "loss": 0.00011406703561078757, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 508.0625, "completions/min_length": 381.0, "epoch": 9.81764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8545649647712708, "kl": 0.013796296902000904, "learning_rate": 6.024490109271841e-07, "loss": 0.00013815611600875854, "reward": 0.6468750238418579, "reward_std": 0.14274363219738007, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 6676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 452.625, "completions/min_length": 408.0, "epoch": 9.819117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.026551604270935, "kl": 0.01338346884585917, "learning_rate": 6.023233973208266e-07, "loss": 0.0001323143660556525, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 436.6875, "completions/min_length": 382.0, "epoch": 9.820588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015260311774909496, "kl": 0.011939306859858334, "learning_rate": 6.021977769736624e-07, "loss": 0.00011900983372470364, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 466.5625, "completions/min_length": 415.0, "epoch": 9.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.018762243911623955, "kl": 0.012222741963341832, "learning_rate": 6.020721498939673e-07, "loss": 0.00012188385153422132, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 475.0, "completions/min_length": 424.0, "epoch": 9.823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.3841356039047241, "kl": 0.01652116933837533, "learning_rate": 6.019465160900172e-07, "loss": 0.00016567111015319824, "reward": 0.710812509059906, "reward_std": 0.3178586959838867, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 6680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 444.6875, "completions/min_length": 394.0, "epoch": 9.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.008494469337165356, "kl": 0.00730123755056411, "learning_rate": 6.018208755700887e-07, "loss": 7.307935447897762e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 489.75, "completions/min_length": 402.0, "epoch": 9.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013031370006501675, "kl": 0.00877995858900249, "learning_rate": 6.016952283424585e-07, "loss": 8.800932846497744e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 430.625, "completions/min_length": 401.0, "epoch": 9.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.022254016250371933, "kl": 0.010091592324897647, "learning_rate": 6.015695744154043e-07, "loss": 0.00010008471872424707, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 500.8125, "completions/min_length": 454.0, "epoch": 9.829411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.285046100616455, "kl": 0.012639330816455185, "learning_rate": 6.014439137972034e-07, "loss": 0.000128374551422894, "reward": 0.925000011920929, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 476.0, "completions/min_length": 400.0, "epoch": 9.830882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.686194658279419, "kl": 0.012103404384106398, "learning_rate": 6.013182464961341e-07, "loss": 0.00012103468179702759, "reward": 0.8999999761581421, "reward_std": 0.2104278802871704, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 468.5625, "completions/min_length": 375.0, "epoch": 9.83235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1158493757247925, "kl": 0.013475270010530949, "learning_rate": 6.011925725204752e-07, "loss": 0.00013716948160436004, "reward": 0.6814166307449341, "reward_std": 0.1315235048532486, "rewards/DrugCombAccuracyCOTORM/mean": 0.6278125047683716, "rewards/DrugCombAccuracyCOTORM/std": 0.4380371868610382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24720662832260132, "step": 6686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 430.5, "completions/min_length": 366.0, "epoch": 9.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015245583839714527, "kl": 0.011007494758814573, "learning_rate": 6.010668918785056e-07, "loss": 0.00011008571891579777, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 436.9375, "completions/min_length": 397.0, "epoch": 9.83529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008869832381606102, "kl": 0.00818654999602586, "learning_rate": 6.009412045785051e-07, "loss": 8.19335546111688e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 458.8125, "completions/min_length": 423.0, "epoch": 9.836764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.0497642755508423, "kl": 0.009472223464399576, "learning_rate": 6.008155106287535e-07, "loss": 9.45962528930977e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 467.9375, "completions/min_length": 418.0, "epoch": 9.838235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4105249643325806, "kl": 0.01567357056774199, "learning_rate": 6.006898100375311e-07, "loss": 0.0001561492681503296, "reward": 0.40625, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 6690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 482.9375, "completions/min_length": 427.0, "epoch": 9.839705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.00952125433832407, "kl": 0.007695451262407005, "learning_rate": 6.005641028131187e-07, "loss": 7.713597733527422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 509.5625, "completions/min_length": 393.0, "epoch": 9.841176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02092534676194191, "kl": 0.011399577371776104, "learning_rate": 6.004383889637979e-07, "loss": 0.00011425621050875634, "reward": 0.8666666746139526, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 441.0625, "completions/min_length": 398.0, "epoch": 9.842647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.297158122062683, "kl": 0.014071544399484992, "learning_rate": 6.003126684978502e-07, "loss": 0.0001411767880199477, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 493.75, "completions/min_length": 438.0, "epoch": 9.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014339890331029892, "kl": 0.009803023654967546, "learning_rate": 6.001869414235576e-07, "loss": 9.85823426162824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 436.875, "completions/min_length": 389.0, "epoch": 9.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9015697240829468, "kl": 0.011963461758568883, "learning_rate": 6.00061207749203e-07, "loss": 0.00012002494622720405, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 523.1875, "completions/min_length": 434.0, "epoch": 9.847058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4250198602676392, "kl": 0.013462495990097523, "learning_rate": 5.999354674830693e-07, "loss": 0.00013568997383117676, "reward": 0.7992864847183228, "reward_std": 0.36066877841949463, "rewards/DrugCombAccuracyCOTORM/mean": 0.768151044845581, "rewards/DrugCombAccuracyCOTORM/std": 0.39938387274742126, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84765625, "rewards/DrugCombCoverageCOTORM/std": 0.4994626045227051, "step": 6696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 463.0625, "completions/min_length": 394.0, "epoch": 9.848529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8369297981262207, "kl": 0.008629192365333438, "learning_rate": 5.998097206334398e-07, "loss": 8.715316653251648e-05, "reward": 0.8698333501815796, "reward_std": 0.028292685747146606, "rewards/DrugCombAccuracyCOTORM/mean": 0.8477083444595337, "rewards/DrugCombAccuracyCOTORM/std": 0.16454075276851654, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 6697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 472.125, "completions/min_length": 405.0, "epoch": 9.85, "frac_reward_zero_std": 0.5, "grad_norm": 1.1681163311004639, "kl": 0.010279768495820463, "learning_rate": 5.996839672085986e-07, "loss": 0.00010288762132404372, "reward": 0.6131874918937683, "reward_std": 0.03215913102030754, "rewards/DrugCombAccuracyCOTORM/mean": 0.5223437547683716, "rewards/DrugCombAccuracyCOTORM/std": 0.49718332290649414, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 6698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 444.875, "completions/min_length": 385.0, "epoch": 9.851470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01718912646174431, "kl": 0.009449254721403122, "learning_rate": 5.995582072168298e-07, "loss": 9.514747944194824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 449.25, "completions/min_length": 405.0, "epoch": 9.852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012286096811294556, "kl": 0.0069465304259210825, "learning_rate": 5.994324406664183e-07, "loss": 6.94960035616532e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 467.4375, "completions/min_length": 380.0, "epoch": 9.854411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.100066900253296, "kl": 0.01658567413687706, "learning_rate": 5.993066675656493e-07, "loss": 0.00016596761997789145, "reward": 0.6153749823570251, "reward_std": 0.08713853359222412, "rewards/DrugCombAccuracyCOTORM/mean": 0.5621874928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.4672471582889557, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.4366062581539154, "step": 6701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 457.5625, "completions/min_length": 378.0, "epoch": 9.855882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.025808127596974373, "kl": 0.012046269839629531, "learning_rate": 5.991808879228082e-07, "loss": 0.00012245668040122837, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 431.5625, "completions/min_length": 372.0, "epoch": 9.85735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010551552288234234, "kl": 0.007497695158235729, "learning_rate": 5.990551017461814e-07, "loss": 7.514465687563643e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 428.4375, "completions/min_length": 390.0, "epoch": 9.858823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9593312740325928, "kl": 0.011073867557570338, "learning_rate": 5.989293090440549e-07, "loss": 0.00010994123294949532, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 477.25, "completions/min_length": 409.0, "epoch": 9.860294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.71024090051651, "kl": 0.009714911808259785, "learning_rate": 5.988035098247161e-07, "loss": 9.7886826551985e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 473.4375, "completions/min_length": 393.0, "epoch": 9.861764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014306548051536083, "kl": 0.009364913450554013, "learning_rate": 5.986777040964521e-07, "loss": 9.34523850446567e-05, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 435.9375, "completions/min_length": 369.0, "epoch": 9.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8541314005851746, "kl": 0.009902090765535831, "learning_rate": 5.985518918675506e-07, "loss": 9.88990068435669e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 463.8125, "completions/min_length": 417.0, "epoch": 9.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.009188061580061913, "kl": 0.007397467619739473, "learning_rate": 5.984260731462999e-07, "loss": 7.435782754328102e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 495.9375, "completions/min_length": 454.0, "epoch": 9.866176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.017055092379450798, "kl": 0.010724837658926845, "learning_rate": 5.983002479409886e-07, "loss": 0.00010744227620307356, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/mean_length": 589.3125, "completions/min_length": 447.0, "epoch": 9.867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1597020626068115, "kl": 0.013763158698566258, "learning_rate": 5.981744162599056e-07, "loss": 0.00013904015941079706, "reward": 0.574999988079071, "reward_std": 0.030860669910907745, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.49441322684288025, "step": 6710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 443.0, "completions/min_length": 387.0, "epoch": 9.869117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0330147743225098, "kl": 0.010840907925739884, "learning_rate": 5.980485781113405e-07, "loss": 0.00010856986045837402, "reward": 0.7961000204086304, "reward_std": 0.17062945663928986, "rewards/DrugCombAccuracyCOTORM/mean": 0.7576249837875366, "rewards/DrugCombAccuracyCOTORM/std": 0.37253955006599426, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8999999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.17888544499874115, "step": 6711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 435.8125, "completions/min_length": 391.0, "epoch": 9.870588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.015977591276168823, "kl": 0.009451285237446427, "learning_rate": 5.979227335035834e-07, "loss": 9.44634375628084e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 427.4375, "completions/min_length": 360.0, "epoch": 9.87205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.026278844103217125, "kl": 0.012298682471737266, "learning_rate": 5.977968824449244e-07, "loss": 0.000122391851618886, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 472.1875, "completions/min_length": 409.0, "epoch": 9.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2475963830947876, "kl": 0.013018935918807983, "learning_rate": 5.976710249436542e-07, "loss": 0.00012998496822547168, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 480.625, "completions/min_length": 432.0, "epoch": 9.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.9771767854690552, "kl": 0.01601046323776245, "learning_rate": 5.975451610080642e-07, "loss": 0.0001596580259501934, "reward": 0.9663333296775818, "reward_std": 0.06375472992658615, "rewards/DrugCombAccuracyCOTORM/mean": 0.96833336353302, "rewards/DrugCombAccuracyCOTORM/std": 0.08652980625629425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.2918649911880493, "step": 6715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 500.125, "completions/min_length": 437.0, "epoch": 9.876470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4601860046386719, "kl": 0.01894818781875074, "learning_rate": 5.974192906464457e-07, "loss": 0.00018968433141708374, "reward": 0.875, "reward_std": 0.2558746933937073, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 479.1875, "completions/min_length": 446.0, "epoch": 9.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02490738406777382, "kl": 0.009036187082529068, "learning_rate": 5.972934138670909e-07, "loss": 9.07228677533567e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 447.0, "completions/min_length": 413.0, "epoch": 9.879411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015817122533917427, "kl": 0.010744222439825535, "learning_rate": 5.971675306782922e-07, "loss": 0.00010751454101409763, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 433.375, "completions/min_length": 380.0, "epoch": 9.880882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0670788288116455, "kl": 0.009500792599283159, "learning_rate": 5.970416410883425e-07, "loss": 9.544434578856453e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 448.3125, "completions/min_length": 402.0, "epoch": 9.882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.010988695546984673, "kl": 0.008529719663783908, "learning_rate": 5.96915745105535e-07, "loss": 8.53110323077999e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 448.5625, "completions/min_length": 357.0, "epoch": 9.883823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.013100915588438511, "kl": 0.010650807642377913, "learning_rate": 5.967898427381635e-07, "loss": 0.00010688541078707203, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 410.375, "completions/min_length": 368.0, "epoch": 9.885294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.00722694443538785, "kl": 0.007137035485357046, "learning_rate": 5.966639339945222e-07, "loss": 7.181117689469829e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 469.25, "completions/min_length": 375.0, "epoch": 9.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9284161329269409, "kl": 0.011764923227019608, "learning_rate": 5.965380188829054e-07, "loss": 0.00011880975216627121, "reward": 0.7048515677452087, "reward_std": 0.1702972650527954, "rewards/DrugCombAccuracyCOTORM/mean": 0.6633561849594116, "rewards/DrugCombAccuracyCOTORM/std": 0.4514012634754181, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7416666746139526, "rewards/DrugCombCoverageCOTORM/std": 0.3309917747974396, "step": 6723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 446.25, "completions/min_length": 387.0, "epoch": 9.888235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9110260009765625, "kl": 0.009418805362656713, "learning_rate": 5.964120974116084e-07, "loss": 9.413760562893003e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 501.1875, "completions/min_length": 423.0, "epoch": 9.889705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.8468554019927979, "kl": 0.014565062941983342, "learning_rate": 5.962861695889263e-07, "loss": 0.0001461617648601532, "reward": 0.5552083253860474, "reward_std": 0.3353438377380371, "rewards/DrugCombAccuracyCOTORM/mean": 0.4479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4069705307483673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 484.25, "completions/min_length": 413.0, "epoch": 9.891176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.032829761505127, "kl": 0.011437075678259134, "learning_rate": 5.961602354231551e-07, "loss": 0.00011396408081054688, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 453.25, "completions/min_length": 407.0, "epoch": 9.89264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016918078064918518, "kl": 0.0115462401881814, "learning_rate": 5.960342949225908e-07, "loss": 0.00011596528202062473, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 455.875, "completions/min_length": 402.0, "epoch": 9.894117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012857372872531414, "kl": 0.009118915419094265, "learning_rate": 5.959083480955303e-07, "loss": 9.189491538563743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 429.0625, "completions/min_length": 389.0, "epoch": 9.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9158901572227478, "kl": 0.012891779653728008, "learning_rate": 5.957823949502705e-07, "loss": 0.00012906899792142212, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 435.5, "completions/min_length": 381.0, "epoch": 9.897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9078304171562195, "kl": 0.014366364805027843, "learning_rate": 5.95656435495109e-07, "loss": 0.00014461498358286917, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 462.1875, "completions/min_length": 414.0, "epoch": 9.898529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3516138792037964, "kl": 0.011598070152103901, "learning_rate": 5.955304697383435e-07, "loss": 0.00011591613292694092, "reward": 0.8645833730697632, "reward_std": 0.24373173713684082, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.3095695972442627, "step": 6731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 482.375, "completions/min_length": 445.0, "epoch": 9.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.013939385302364826, "kl": 0.008723770850338042, "learning_rate": 5.954044976882723e-07, "loss": 8.661411993671209e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 474.0, "completions/min_length": 360.0, "epoch": 9.901470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.024196311831474304, "kl": 0.011333226226270199, "learning_rate": 5.952785193531945e-07, "loss": 0.00011624557373579592, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 442.5625, "completions/min_length": 407.0, "epoch": 9.902941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.017810406163334846, "kl": 0.009752546669915318, "learning_rate": 5.951525347414088e-07, "loss": 9.869420318864286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 437.3125, "completions/min_length": 391.0, "epoch": 9.904411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017089534550905228, "kl": 0.01490681548602879, "learning_rate": 5.95026543861215e-07, "loss": 0.00014961484703235328, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 445.5, "completions/min_length": 380.0, "epoch": 9.905882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.013693376444280148, "kl": 0.009493384161032736, "learning_rate": 5.949005467209129e-07, "loss": 9.482220048084855e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 497.625, "completions/min_length": 416.0, "epoch": 9.907352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.972550392150879, "kl": 0.020647745113819838, "learning_rate": 5.947745433288029e-07, "loss": 0.0002069920301437378, "reward": 0.7836250066757202, "reward_std": 0.3247252106666565, "rewards/DrugCombAccuracyCOTORM/mean": 0.7464583516120911, "rewards/DrugCombAccuracyCOTORM/std": 0.39545655250549316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.286865234375, "step": 6737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 484.9375, "completions/min_length": 425.0, "epoch": 9.908823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.08039306849241257, "kl": 0.010092323180288076, "learning_rate": 5.94648533693186e-07, "loss": 0.00010127038694918156, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 471.5, "completions/min_length": 430.0, "epoch": 9.910294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9209877252578735, "kl": 0.009191410150378942, "learning_rate": 5.94522517822363e-07, "loss": 9.196624159812927e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 429.625, "completions/min_length": 394.0, "epoch": 9.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1843498945236206, "kl": 0.011839017504826188, "learning_rate": 5.943964957246359e-07, "loss": 0.0001179407408926636, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 441.5, "completions/min_length": 317.0, "epoch": 9.913235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.1666545867919922, "kl": 0.009115605964325368, "learning_rate": 5.942704674083065e-07, "loss": 8.991360664367676e-05, "reward": 0.762499988079071, "reward_std": 0.4397645592689514, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 6741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 469.4375, "completions/min_length": 407.0, "epoch": 9.91470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0655903816223145, "kl": 0.01099497638642788, "learning_rate": 5.941444328816774e-07, "loss": 0.00010947883129119873, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 497.1875, "completions/min_length": 457.0, "epoch": 9.916176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7781286239624023, "kl": 0.011883385246619582, "learning_rate": 5.940183921530512e-07, "loss": 0.00011883841216331348, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 6743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 461.125, "completions/min_length": 390.0, "epoch": 9.91764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.362448811531067, "kl": 0.011647771811112761, "learning_rate": 5.938923452307312e-07, "loss": 0.00011630356311798096, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 420.8125, "completions/min_length": 329.0, "epoch": 9.919117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.1285652369260788, "kl": 0.015287209418602288, "learning_rate": 5.937662921230211e-07, "loss": 0.00014801220095250756, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 472.875, "completions/min_length": 395.0, "epoch": 9.920588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.019503116607666, "kl": 0.012164085870608687, "learning_rate": 5.93640232838225e-07, "loss": 0.00012155622243881226, "reward": 0.6180000305175781, "reward_std": 0.02074180170893669, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.4515184462070465, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 6746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 477.1875, "completions/min_length": 426.0, "epoch": 9.922058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9314186573028564, "kl": 0.008449915796518326, "learning_rate": 5.935141673846473e-07, "loss": 8.452683687210083e-05, "reward": 0.8999999761581421, "reward_std": 0.1380131095647812, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 462.75, "completions/min_length": 396.0, "epoch": 9.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7864665985107422, "kl": 0.01016107713803649, "learning_rate": 5.933880957705931e-07, "loss": 0.00010195241338806227, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 480.1875, "completions/min_length": 370.0, "epoch": 9.925, "frac_reward_zero_std": 0.5, "grad_norm": 0.9177362322807312, "kl": 0.010294357663951814, "learning_rate": 5.932620180043674e-07, "loss": 0.00010326354822609574, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 495.0, "completions/min_length": 435.0, "epoch": 9.926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8804269433021545, "kl": 0.01450467319227755, "learning_rate": 5.931359340942758e-07, "loss": 0.00014533838839270175, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 389.1875, "completions/min_length": 304.0, "epoch": 9.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011979801580309868, "kl": 0.00923549709841609, "learning_rate": 5.930098440486248e-07, "loss": 9.326584404334426e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 525.0, "completions/min_length": 464.0, "epoch": 9.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9579317569732666, "kl": 0.010471437126398087, "learning_rate": 5.928837478757205e-07, "loss": 0.00010308740456821397, "reward": 0.7458333373069763, "reward_std": 0.21058309078216553, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 6752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 455.5, "completions/min_length": 393.0, "epoch": 9.930882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8528057336807251, "kl": 0.01220775826368481, "learning_rate": 5.927576455838698e-07, "loss": 0.00012266141129657626, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 446.3125, "completions/min_length": 408.0, "epoch": 9.93235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017703814432024956, "kl": 0.008137068594805896, "learning_rate": 5.926315371813806e-07, "loss": 8.116591925499961e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 418.875, "completions/min_length": 348.0, "epoch": 9.933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0659283772110939, "kl": 0.015115629648789763, "learning_rate": 5.925054226765598e-07, "loss": 0.0001513191091362387, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 495.5625, "completions/min_length": 440.0, "epoch": 9.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016648264601826668, "kl": 0.01076267403550446, "learning_rate": 5.923793020777159e-07, "loss": 0.0001070656071533449, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 432.5625, "completions/min_length": 368.0, "epoch": 9.936764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.4377341270446777, "kl": 0.01279247528873384, "learning_rate": 5.922531753931574e-07, "loss": 0.00012628364493139088, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 414.3125, "completions/min_length": 365.0, "epoch": 9.938235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.2691437005996704, "kl": 0.011103201191872358, "learning_rate": 5.921270426311931e-07, "loss": 0.00011141767754452303, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 475.3125, "completions/min_length": 389.0, "epoch": 9.939705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.264741063117981, "kl": 0.011749025899916887, "learning_rate": 5.920009038001325e-07, "loss": 0.00011654198169708252, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/mean_length": 513.75, "completions/min_length": 376.0, "epoch": 9.941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.2073382139205933, "kl": 0.011498028645291924, "learning_rate": 5.918747589082852e-07, "loss": 0.00011433314648456872, "reward": 0.743154764175415, "reward_std": 0.13084682822227478, "rewards/DrugCombAccuracyCOTORM/mean": 0.6919642686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.38719770312309265, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 454.5625, "completions/min_length": 413.0, "epoch": 9.94264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019023165106773376, "kl": 0.012571154162287712, "learning_rate": 5.917486079639612e-07, "loss": 0.00012528777006082237, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 482.8125, "completions/min_length": 416.0, "epoch": 9.944117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.5896621942520142, "kl": 0.010485473787412047, "learning_rate": 5.916224509754712e-07, "loss": 0.00010485947132110596, "reward": 0.5874999761581421, "reward_std": 0.3951810300350189, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 494.0, "completions/min_length": 444.0, "epoch": 9.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8699357509613037, "kl": 0.012808118714019656, "learning_rate": 5.914962879511258e-07, "loss": 0.0001276734983548522, "reward": 0.606249988079071, "reward_std": 0.16132819652557373, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 6763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 449.4375, "completions/min_length": 386.0, "epoch": 9.947058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660994254052639, "kl": 0.008589790668338537, "learning_rate": 5.913701188992366e-07, "loss": 8.611659723101184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 457.25, "completions/min_length": 421.0, "epoch": 9.948529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.057836003601551056, "kl": 0.011879905825480819, "learning_rate": 5.912439438281151e-07, "loss": 0.00011936856026295573, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 474.375, "completions/min_length": 426.0, "epoch": 9.95, "frac_reward_zero_std": 0.5, "grad_norm": 1.0853339433670044, "kl": 0.011911319801583886, "learning_rate": 5.911177627460738e-07, "loss": 0.00011945827282033861, "reward": 0.4749999940395355, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 450.0625, "completions/min_length": 363.0, "epoch": 9.951470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0419652462005615, "kl": 0.009934014407917857, "learning_rate": 5.909915756614247e-07, "loss": 9.954607958206907e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 414.125, "completions/min_length": 364.0, "epoch": 9.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03725894168019295, "kl": 0.015818243846297264, "learning_rate": 5.908653825824808e-07, "loss": 0.0001587997394381091, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 438.25, "completions/min_length": 374.0, "epoch": 9.954411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015277853235602379, "kl": 0.011112296022474766, "learning_rate": 5.907391835175555e-07, "loss": 0.00011031015310436487, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 464.625, "completions/min_length": 388.0, "epoch": 9.955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010282758623361588, "kl": 0.007396181928925216, "learning_rate": 5.906129784749624e-07, "loss": 7.354572881013155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 498.0, "completions/min_length": 440.0, "epoch": 9.95735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8689300417900085, "kl": 0.010678548365831375, "learning_rate": 5.904867674630155e-07, "loss": 0.0001065284013748169, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 444.875, "completions/min_length": 394.0, "epoch": 9.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645508222281933, "kl": 0.010274000931531191, "learning_rate": 5.903605504900296e-07, "loss": 0.00010226931772194803, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 459.6875, "completions/min_length": 411.0, "epoch": 9.96029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011187382973730564, "kl": 0.008267137571237981, "learning_rate": 5.902343275643192e-07, "loss": 8.23816517367959e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 444.375, "completions/min_length": 382.0, "epoch": 9.961764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.237773060798645, "kl": 0.014815691509284079, "learning_rate": 5.901080986941995e-07, "loss": 0.00014542043209075928, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 453.8125, "completions/min_length": 417.0, "epoch": 9.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9461926817893982, "kl": 0.010656771366484463, "learning_rate": 5.899818638879864e-07, "loss": 0.00010570138692855835, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 408.25, "completions/min_length": 326.0, "epoch": 9.964705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.030663471668958664, "kl": 0.009355498012155294, "learning_rate": 5.898556231539958e-07, "loss": 9.470323857385665e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 431.3125, "completions/min_length": 395.0, "epoch": 9.966176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.037038370966911316, "kl": 0.011176663683727384, "learning_rate": 5.897293765005443e-07, "loss": 0.00011133226507809013, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 458.375, "completions/min_length": 386.0, "epoch": 9.967647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8036416172981262, "kl": 0.010504282778128982, "learning_rate": 5.896031239359485e-07, "loss": 0.00010567301069386303, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 422.9375, "completions/min_length": 360.0, "epoch": 9.969117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.012775523588061333, "kl": 0.00848882191348821, "learning_rate": 5.894768654685254e-07, "loss": 8.591412915848196e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 447.625, "completions/min_length": 394.0, "epoch": 9.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.016197534278035164, "kl": 0.008786565507762134, "learning_rate": 5.89350601106593e-07, "loss": 8.789062121650204e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 421.5, "completions/min_length": 381.0, "epoch": 9.972058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9820442795753479, "kl": 0.0117159360088408, "learning_rate": 5.892243308584692e-07, "loss": 0.0001174781791632995, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 530.0, "completions/min_length": 475.0, "epoch": 9.973529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9170713424682617, "kl": 0.011775968363508582, "learning_rate": 5.890980547324722e-07, "loss": 0.00011804843961726874, "reward": 0.6354166865348816, "reward_std": 0.1462324559688568, "rewards/DrugCombAccuracyCOTORM/mean": 0.5833333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.4791968762874603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 6782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 478.6875, "completions/min_length": 399.0, "epoch": 9.975, "frac_reward_zero_std": 0.5, "grad_norm": 1.6730971336364746, "kl": 0.019165946170687675, "learning_rate": 5.889717727369209e-07, "loss": 0.0001910054124891758, "reward": 0.7868333458900452, "reward_std": 0.16055183112621307, "rewards/DrugCombAccuracyCOTORM/mean": 0.7439583539962769, "rewards/DrugCombAccuracyCOTORM/std": 0.37054356932640076, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.16101530194282532, "step": 6783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 471.4375, "completions/min_length": 390.0, "epoch": 9.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8045142889022827, "kl": 0.008578501758165658, "learning_rate": 5.888454848801344e-07, "loss": 8.441507816314697e-05, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 6784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 454.875, "completions/min_length": 406.0, "epoch": 9.977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.013205161318182945, "kl": 0.009320091689005494, "learning_rate": 5.887191911704322e-07, "loss": 9.332541958428919e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 416.5, "completions/min_length": 376.0, "epoch": 9.979411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.03217218071222305, "kl": 0.011998404283076525, "learning_rate": 5.88592891616134e-07, "loss": 0.0001211049166158773, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 455.4375, "completions/min_length": 403.0, "epoch": 9.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7629821300506592, "kl": 0.012028994504362345, "learning_rate": 5.884665862255604e-07, "loss": 0.00012004747986793518, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 397.625, "completions/min_length": 294.0, "epoch": 9.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02300916612148285, "kl": 0.011380567448213696, "learning_rate": 5.88340275007032e-07, "loss": 0.00011347817780915648, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 422.25, "completions/min_length": 394.0, "epoch": 9.983823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015383192338049412, "kl": 0.008808697923086584, "learning_rate": 5.882139579688699e-07, "loss": 8.863718539942056e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 467.125, "completions/min_length": 421.0, "epoch": 9.985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.009430239908397198, "kl": 0.008054811740294099, "learning_rate": 5.880876351193955e-07, "loss": 8.051971963141114e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 480.25, "completions/min_length": 442.0, "epoch": 9.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0031006336212158, "kl": 0.011383843258954585, "learning_rate": 5.879613064669306e-07, "loss": 0.0001138412844738923, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 467.4375, "completions/min_length": 402.0, "epoch": 9.988235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012731525115668774, "kl": 0.009686734178103507, "learning_rate": 5.878349720197973e-07, "loss": 9.741855319589376e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 466.1875, "completions/min_length": 404.0, "epoch": 9.989705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.02524365670979023, "kl": 0.011135532055050135, "learning_rate": 5.877086317863184e-07, "loss": 0.00011156778055010363, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 422.125, "completions/min_length": 336.0, "epoch": 9.991176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.23223455250263214, "kl": 0.02675691992044449, "learning_rate": 5.875822857748168e-07, "loss": 0.0002752014552243054, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/mean_length": 503.625, "completions/min_length": 378.0, "epoch": 9.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0316013097763062, "kl": 0.012719037709757686, "learning_rate": 5.874559339936159e-07, "loss": 0.00012806293671019375, "reward": 0.9368749856948853, "reward_std": 0.07085887342691422, "rewards/DrugCombAccuracyCOTORM/mean": 0.925000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.1374368518590927, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 401.0625, "completions/min_length": 335.0, "epoch": 9.994117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01053217425942421, "kl": 0.010330110089853406, "learning_rate": 5.873295764510394e-07, "loss": 0.000103034071798902, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 458.5625, "completions/min_length": 427.0, "epoch": 9.995588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859161615371704, "kl": 0.009348645573481917, "learning_rate": 5.872032131554115e-07, "loss": 9.366869926452637e-05, "reward": 0.9479166865348816, "reward_std": 0.08282582461833954, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 503.375, "completions/min_length": 440.0, "epoch": 9.99705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.004022479057312, "kl": 0.009842860628850758, "learning_rate": 5.870768441150564e-07, "loss": 9.781867265701294e-05, "reward": 0.5979166626930237, "reward_std": 0.005892557092010975, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 449.4375, "completions/min_length": 415.0, "epoch": 9.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.011076582595705986, "kl": 0.009276153054088354, "learning_rate": 5.869504693382992e-07, "loss": 9.254281758330762e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 453.125, "completions/min_length": 402.0, "epoch": 10.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05434055253863335, "kl": 0.011131529230624437, "learning_rate": 5.868240888334652e-07, "loss": 0.0001121445675380528, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 482.25, "completions/min_length": 424.0, "epoch": 10.001470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8266306519508362, "kl": 0.011161613743752241, "learning_rate": 5.8669770260888e-07, "loss": 0.0001118828949984163, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 504.9375, "completions/min_length": 421.0, "epoch": 10.00294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9199860095977783, "kl": 0.009072000510059297, "learning_rate": 5.865713106728695e-07, "loss": 9.057480201590806e-05, "reward": 0.7625000476837158, "reward_std": 0.15612494945526123, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3890872597694397, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.490653395652771, "step": 6802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.6875, "completions/min_length": 404.0, "epoch": 10.004411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.019876394420862198, "kl": 0.009400533745065331, "learning_rate": 5.864449130337601e-07, "loss": 9.340907854493707e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 440.125, "completions/min_length": 371.0, "epoch": 10.005882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9735097289085388, "kl": 0.010232086991891265, "learning_rate": 5.863185096998786e-07, "loss": 0.00010236522211926058, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 514.5, "completions/min_length": 390.0, "epoch": 10.007352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.2723166942596436, "kl": 0.020013358211144805, "learning_rate": 5.861921006795521e-07, "loss": 0.00019446015357971191, "reward": 0.7333333492279053, "reward_std": 0.2997860312461853, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42163705825805664, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 479.75, "completions/min_length": 407.0, "epoch": 10.008823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.02080538496375084, "kl": 0.012783480109646916, "learning_rate": 5.860656859811081e-07, "loss": 0.0001267180050490424, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 436.25, "completions/min_length": 389.0, "epoch": 10.010294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1576838493347168, "kl": 0.01658615400083363, "learning_rate": 5.859392656128746e-07, "loss": 0.00016643224807921797, "reward": 0.8812500238418579, "reward_std": 0.2202879637479782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 6807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 434.25, "completions/min_length": 358.0, "epoch": 10.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.015528917312622, "kl": 0.011274825897999108, "learning_rate": 5.858128395831798e-07, "loss": 0.00011422352690715343, "reward": 0.7246500253677368, "reward_std": 0.18139563500881195, "rewards/DrugCombAccuracyCOTORM/mean": 0.6823749542236328, "rewards/DrugCombAccuracyCOTORM/std": 0.42912089824676514, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7875000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.49581584334373474, "step": 6808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 417.75, "completions/min_length": 367.0, "epoch": 10.013235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013604722917079926, "kl": 0.009546962566673756, "learning_rate": 5.856864079003523e-07, "loss": 9.506134665571153e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 434.3125, "completions/min_length": 363.0, "epoch": 10.014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.024220971390604973, "kl": 0.009432381368242204, "learning_rate": 5.855599705727211e-07, "loss": 9.482933091931045e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 416.0625, "completions/min_length": 381.0, "epoch": 10.016176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8039875030517578, "kl": 0.009467350784689188, "learning_rate": 5.854335276086153e-07, "loss": 9.438369306735694e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 534.9375, "completions/min_length": 436.0, "epoch": 10.01764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.038070496171712875, "kl": 0.011163485469296575, "learning_rate": 5.853070790163651e-07, "loss": 0.00011075756628997624, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 475.375, "completions/min_length": 363.0, "epoch": 10.019117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.396705150604248, "kl": 0.013402610318735242, "learning_rate": 5.851806248043005e-07, "loss": 0.00013640522956848145, "reward": 0.42500001192092896, "reward_std": 0.3683890998363495, "rewards/DrugCombAccuracyCOTORM/mean": 0.28125, "rewards/DrugCombAccuracyCOTORM/std": 0.44604745507240295, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 557.8125, "completions/min_length": 461.0, "epoch": 10.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.834072470664978, "kl": 0.01139144110493362, "learning_rate": 5.850541649807519e-07, "loss": 0.00011426210403442383, "reward": 0.8670925498008728, "reward_std": 0.12208949774503708, "rewards/DrugCombAccuracyCOTORM/mean": 0.8416781425476074, "rewards/DrugCombAccuracyCOTORM/std": 0.26089560985565186, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.11453071236610413, "step": 6814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 507.3125, "completions/min_length": 466.0, "epoch": 10.022058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.4426114559173584, "kl": 0.011849492439068854, "learning_rate": 5.849276995540501e-07, "loss": 0.00011884048581123352, "reward": 0.5676667094230652, "reward_std": 0.35680919885635376, "rewards/DrugCombAccuracyCOTORM/mean": 0.5116666555404663, "rewards/DrugCombAccuracyCOTORM/std": 0.4806569814682007, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5833333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.4791968762874603, "step": 6815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 465.75, "completions/min_length": 419.0, "epoch": 10.023529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9818464517593384, "kl": 0.011229031020775437, "learning_rate": 5.848012285325263e-07, "loss": 0.00011233959958190098, "reward": 0.6625000238418579, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6191391944885254, "step": 6816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 514.25, "completions/min_length": 423.0, "epoch": 10.025, "frac_reward_zero_std": 0.5, "grad_norm": 0.9126274585723877, "kl": 0.010864417999982834, "learning_rate": 5.846747519245122e-07, "loss": 0.00010818550072144717, "reward": 0.7214166522026062, "reward_std": 0.23668919503688812, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.71200031042099, "step": 6817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 459.6875, "completions/min_length": 372.0, "epoch": 10.026470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4141185283660889, "kl": 0.011345570906996727, "learning_rate": 5.845482697383398e-07, "loss": 0.00011388957500457764, "reward": 0.660937488079071, "reward_std": 0.39946258068084717, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.7187952995300293, "step": 6818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 572.25, "completions/min_length": 505.0, "epoch": 10.027941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0297974348068237, "kl": 0.011653855443000793, "learning_rate": 5.844217819823414e-07, "loss": 0.00011680275201797485, "reward": 0.2220284640789032, "reward_std": 0.026878032833337784, "rewards/DrugCombAccuracyCOTORM/mean": 0.11063020676374435, "rewards/DrugCombAccuracyCOTORM/std": 0.11900988966226578, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3352430462837219, "rewards/DrugCombCoverageCOTORM/std": 0.36063605546951294, "step": 6819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 477.8125, "completions/min_length": 373.0, "epoch": 10.029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1281652450561523, "kl": 0.010986157925799489, "learning_rate": 5.842952886648495e-07, "loss": 0.00010923318041022867, "reward": 0.925000011920929, "reward_std": 0.14880475401878357, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 425.625, "completions/min_length": 374.0, "epoch": 10.030882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011199341155588627, "kl": 0.010151647846214473, "learning_rate": 5.841687897941974e-07, "loss": 0.00010155822383239865, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 492.25, "completions/min_length": 405.0, "epoch": 10.032352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8554038405418396, "kl": 0.013436155626550317, "learning_rate": 5.840422853787184e-07, "loss": 0.00013488903641700745, "reward": 0.8625208139419556, "reward_std": 0.11020281910896301, "rewards/DrugCombAccuracyCOTORM/mean": 0.8346614837646484, "rewards/DrugCombAccuracyCOTORM/std": 0.2510888874530792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 6822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 456.875, "completions/min_length": 388.0, "epoch": 10.033823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.2505097389221191, "kl": 0.009679927956312895, "learning_rate": 5.839157754267463e-07, "loss": 9.706616401672363e-05, "reward": 0.6852083206176758, "reward_std": 0.22957724332809448, "rewards/DrugCombAccuracyCOTORM/mean": 0.6312500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3829454779624939, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.21273136138916016, "step": 6823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 575.0, "completions/min_length": 447.0, "epoch": 10.035294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7423175573348999, "kl": 0.011173592181876302, "learning_rate": 5.837892599466154e-07, "loss": 0.00011321902275085449, "reward": 0.8412894010543823, "reward_std": 0.1553107351064682, "rewards/DrugCombAccuracyCOTORM/mean": 0.807601273059845, "rewards/DrugCombAccuracyCOTORM/std": 0.3245069980621338, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9520833492279053, "rewards/DrugCombCoverageCOTORM/std": 0.08603940159082413, "step": 6824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 446.1875, "completions/min_length": 382.0, "epoch": 10.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.027151264250278473, "kl": 0.012141804909333587, "learning_rate": 5.836627389466601e-07, "loss": 0.00012105036148568615, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 502.0, "completions/min_length": 438.0, "epoch": 10.038235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.012853855267167091, "kl": 0.008730708970688283, "learning_rate": 5.835362124352151e-07, "loss": 8.734122820897028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 511.375, "completions/min_length": 451.0, "epoch": 10.03970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9668943881988525, "kl": 0.015200129710137844, "learning_rate": 5.834096804206159e-07, "loss": 0.00015158698079176247, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/mean_length": 527.8125, "completions/min_length": 332.0, "epoch": 10.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8992544412612915, "kl": 0.010478481417521834, "learning_rate": 5.832831429111981e-07, "loss": 0.00010809909144882113, "reward": 0.5830357074737549, "reward_std": 0.0008417930221185088, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8303571343421936, "rewards/DrugCombCoverageCOTORM/std": 0.17558346688747406, "step": 6828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 455.3125, "completions/min_length": 397.0, "epoch": 10.04264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017479846253991127, "kl": 0.010131336515769362, "learning_rate": 5.831565999152975e-07, "loss": 0.00010176612704526633, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 472.6875, "completions/min_length": 417.0, "epoch": 10.044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01970859430730343, "kl": 0.013213608879595995, "learning_rate": 5.830300514412506e-07, "loss": 0.0001317404821747914, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 517.25, "completions/min_length": 409.0, "epoch": 10.045588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0232491493225098, "kl": 0.012487960746511817, "learning_rate": 5.829034974973941e-07, "loss": 0.00012564027565531433, "reward": 0.8902592062950134, "reward_std": 0.13419082760810852, "rewards/DrugCombAccuracyCOTORM/mean": 0.8758448362350464, "rewards/DrugCombAccuracyCOTORM/std": 0.24784055352210999, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.291070818901062, "step": 6831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 516.125, "completions/min_length": 425.0, "epoch": 10.047058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.821324348449707, "kl": 0.009264311753213406, "learning_rate": 5.82776938092065e-07, "loss": 9.272992610931396e-05, "reward": 0.8763333559036255, "reward_std": 0.11290507018566132, "rewards/DrugCombAccuracyCOTORM/mean": 0.8506250381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.23239246010780334, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 6832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 467.1875, "completions/min_length": 384.0, "epoch": 10.048529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.130367398262024, "kl": 0.010848693316802382, "learning_rate": 5.826503732336006e-07, "loss": 0.00010786578059196472, "reward": 0.625, "reward_std": 0.43847471475601196, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 456.0, "completions/min_length": 385.0, "epoch": 10.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.031459614634513855, "kl": 0.010374097153544426, "learning_rate": 5.825238029303387e-07, "loss": 0.0001038277187035419, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 426.125, "completions/min_length": 368.0, "epoch": 10.051470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01991955377161503, "kl": 0.009135361411608756, "learning_rate": 5.823972271906177e-07, "loss": 9.151407721219584e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 413.4375, "completions/min_length": 357.0, "epoch": 10.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.05466442555189133, "kl": 0.011890405789017677, "learning_rate": 5.822706460227757e-07, "loss": 0.00011959132098127156, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 468.5, "completions/min_length": 415.0, "epoch": 10.054411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8934885263442993, "kl": 0.010058063548058271, "learning_rate": 5.82144059435152e-07, "loss": 0.00010061623470392078, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 459.8125, "completions/min_length": 398.0, "epoch": 10.055882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.04058632627129555, "kl": 0.01271802932024002, "learning_rate": 5.820174674360854e-07, "loss": 0.00012526843056548387, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 479.4375, "completions/min_length": 414.0, "epoch": 10.05735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.25214123725891113, "kl": 0.01527584926225245, "learning_rate": 5.818908700339155e-07, "loss": 0.0001546464627608657, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 486.0625, "completions/min_length": 430.0, "epoch": 10.058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.03135108947753906, "kl": 0.010284089948982, "learning_rate": 5.817642672369825e-07, "loss": 0.00010317983833374456, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 496.8125, "completions/min_length": 380.0, "epoch": 10.060294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9781702160835266, "kl": 0.009900673874653876, "learning_rate": 5.816376590536264e-07, "loss": 9.734369814395905e-05, "reward": 0.5680624842643738, "reward_std": 0.05475036799907684, "rewards/DrugCombAccuracyCOTORM/mean": 0.5206249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.49783825874328613, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.6700435280799866, "step": 6841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 458.5, "completions/min_length": 383.0, "epoch": 10.061764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.023355484008789, "kl": 0.00945719110313803, "learning_rate": 5.815110454921879e-07, "loss": 9.3899667263031e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 508.8125, "completions/min_length": 442.0, "epoch": 10.063235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8325799703598022, "kl": 0.012523811194114387, "learning_rate": 5.81384426561008e-07, "loss": 0.0001260894350707531, "reward": 0.875094473361969, "reward_std": 0.0680520161986351, "rewards/DrugCombAccuracyCOTORM/mean": 0.8846666812896729, "rewards/DrugCombAccuracyCOTORM/std": 0.1550111025571823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6736111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.6672451496124268, "step": 6843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 437.125, "completions/min_length": 399.0, "epoch": 10.064705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.09427390992641449, "kl": 0.014450011309236288, "learning_rate": 5.812578022684281e-07, "loss": 0.00014599240967072546, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 451.5625, "completions/min_length": 376.0, "epoch": 10.066176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7390778064727783, "kl": 0.01067588187288493, "learning_rate": 5.811311726227899e-07, "loss": 0.00010581314563751221, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 472.0625, "completions/min_length": 449.0, "epoch": 10.06764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012736010365188122, "kl": 0.009132056729868054, "learning_rate": 5.810045376324352e-07, "loss": 9.088587830774486e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 460.625, "completions/min_length": 401.0, "epoch": 10.069117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8281378746032715, "kl": 0.009959811461158097, "learning_rate": 5.808778973057066e-07, "loss": 9.971487452276051e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 6847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 450.5, "completions/min_length": 381.0, "epoch": 10.070588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.1871461868286133, "kl": 0.013833870878443122, "learning_rate": 5.807512516509467e-07, "loss": 0.00013912351278122514, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 401.0625, "completions/min_length": 344.0, "epoch": 10.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015746260061860085, "kl": 0.010871572187170386, "learning_rate": 5.80624600676499e-07, "loss": 0.00010803636541822925, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 522.6875, "completions/min_length": 408.0, "epoch": 10.073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7720907926559448, "kl": 0.011320243123918772, "learning_rate": 5.804979443907064e-07, "loss": 0.00011355429887771606, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 521.8125, "completions/min_length": 394.0, "epoch": 10.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.7824695706367493, "kl": 0.009174249018542469, "learning_rate": 5.803712828019131e-07, "loss": 9.12554532987997e-05, "reward": 0.6764881014823914, "reward_std": 0.1489594578742981, "rewards/DrugCombAccuracyCOTORM/mean": 0.6242559552192688, "rewards/DrugCombAccuracyCOTORM/std": 0.45448485016822815, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.291070818901062, "step": 6851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 482.375, "completions/min_length": 443.0, "epoch": 10.076470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0188186876475811, "kl": 0.008654749719426036, "learning_rate": 5.802446159184628e-07, "loss": 8.666068606544286e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 450.5, "completions/min_length": 365.0, "epoch": 10.077941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0163061618804932, "kl": 0.010470901150256395, "learning_rate": 5.801179437487005e-07, "loss": 0.00010342895984649658, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 476.8125, "completions/min_length": 384.0, "epoch": 10.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.5107674598693848, "kl": 0.022211568895727396, "learning_rate": 5.799912663009709e-07, "loss": 0.00022619962692260742, "reward": 0.7746666669845581, "reward_std": 0.11295006424188614, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.34013479948043823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 6854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 466.5, "completions/min_length": 404.0, "epoch": 10.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8911947011947632, "kl": 0.009423078736290336, "learning_rate": 5.79864583583619e-07, "loss": 9.43988561630249e-05, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 421.4375, "completions/min_length": 388.0, "epoch": 10.08235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.178147554397583, "kl": 0.01334210461936891, "learning_rate": 5.797378956049904e-07, "loss": 0.00013324532483238727, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 477.875, "completions/min_length": 395.0, "epoch": 10.083823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.4548813104629517, "kl": 0.016943786293268204, "learning_rate": 5.796112023734311e-07, "loss": 0.00016732513904571533, "reward": 0.5796874761581421, "reward_std": 0.3214433789253235, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 6857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 430.25, "completions/min_length": 384.0, "epoch": 10.08529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012610657140612602, "kl": 0.008543720003217459, "learning_rate": 5.794845038972871e-07, "loss": 8.611418888904154e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 530.625, "completions/min_length": 435.0, "epoch": 10.086764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.011587467044591904, "kl": 0.00811103405430913, "learning_rate": 5.793578001849053e-07, "loss": 8.166619227267802e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 420.4375, "completions/min_length": 379.0, "epoch": 10.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.024577975273132324, "kl": 0.007914547226391733, "learning_rate": 5.792310912446326e-07, "loss": 7.937510963529348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 464.5, "completions/min_length": 414.0, "epoch": 10.089705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.79547518491745, "kl": 0.009443509392440319, "learning_rate": 5.791043770848158e-07, "loss": 9.419023990631104e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 404.625, "completions/min_length": 355.0, "epoch": 10.091176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008501737378537655, "kl": 0.007482994697056711, "learning_rate": 5.78977657713803e-07, "loss": 7.493897282984108e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 426.75, "completions/min_length": 340.0, "epoch": 10.092647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012991252355277538, "kl": 0.01047059812117368, "learning_rate": 5.788509331399418e-07, "loss": 0.00010644732537912205, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 467.9375, "completions/min_length": 366.0, "epoch": 10.094117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.535839557647705, "kl": 0.00763029872905463, "learning_rate": 5.787242033715807e-07, "loss": 7.689894846407697e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 6864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 463.375, "completions/min_length": 397.0, "epoch": 10.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02572019211947918, "kl": 0.01210906496271491, "learning_rate": 5.785974684170685e-07, "loss": 0.00012087903451174498, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 482.5, "completions/min_length": 457.0, "epoch": 10.097058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.9074629545211792, "kl": 0.011297776829451323, "learning_rate": 5.78470728284754e-07, "loss": 0.0001134723424911499, "reward": 0.7937500476837158, "reward_std": 0.36611872911453247, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 462.125, "completions/min_length": 415.0, "epoch": 10.098529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.014527622610330582, "kl": 0.011344633297994733, "learning_rate": 5.783439829829864e-07, "loss": 0.00011332138092257082, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 496.0625, "completions/min_length": 412.0, "epoch": 10.1, "frac_reward_zero_std": 0.5, "grad_norm": 0.9741957187652588, "kl": 0.01508043915964663, "learning_rate": 5.782172325201155e-07, "loss": 0.00015148959937505424, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 490.8125, "completions/min_length": 459.0, "epoch": 10.101470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.019668355584144592, "kl": 0.012253757799044251, "learning_rate": 5.780904769044912e-07, "loss": 0.00012201358913443983, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 6869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 463.3125, "completions/min_length": 387.0, "epoch": 10.102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1800761222839355, "kl": 0.01388566242530942, "learning_rate": 5.779637161444638e-07, "loss": 0.00013659894466400146, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 494.625, "completions/min_length": 422.0, "epoch": 10.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8982350826263428, "kl": 0.016693812562152743, "learning_rate": 5.778369502483843e-07, "loss": 0.00016829090600367635, "reward": 0.987333357334137, "reward_std": 0.03582672402262688, "rewards/DrugCombAccuracyCOTORM/mean": 0.98416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.06333333253860474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 477.4375, "completions/min_length": 433.0, "epoch": 10.105882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.05523509904742241, "kl": 0.01395411358680576, "learning_rate": 5.777101792246036e-07, "loss": 0.0001395066356053576, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 441.75, "completions/min_length": 407.0, "epoch": 10.10735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2035094499588013, "kl": 0.007854047697037458, "learning_rate": 5.775834030814725e-07, "loss": 7.86345117376186e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 463.3125, "completions/min_length": 407.0, "epoch": 10.108823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9715315103530884, "kl": 0.013872420880943537, "learning_rate": 5.774566218273435e-07, "loss": 0.00013688504986930639, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 438.9375, "completions/min_length": 352.0, "epoch": 10.110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010893157683312893, "kl": 0.008230420062318444, "learning_rate": 5.773298354705683e-07, "loss": 8.286711818072945e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 495.0, "completions/min_length": 441.0, "epoch": 10.111764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02746065892279148, "kl": 0.007741348003037274, "learning_rate": 5.77203044019499e-07, "loss": 7.771434320602566e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 479.75, "completions/min_length": 376.0, "epoch": 10.113235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8574566841125488, "kl": 0.01060469076037407, "learning_rate": 5.770762474824886e-07, "loss": 0.00010724365711212158, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 495.9375, "completions/min_length": 444.0, "epoch": 10.114705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.2955886125564575, "kl": 0.01362194656394422, "learning_rate": 5.769494458678903e-07, "loss": 0.00013560056686401367, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 483.25, "completions/min_length": 399.0, "epoch": 10.116176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.5603570938110352, "kl": 0.013074184418655932, "learning_rate": 5.76822639184057e-07, "loss": 0.00013115620822645724, "reward": 0.6786458492279053, "reward_std": 0.17891646921634674, "rewards/DrugCombAccuracyCOTORM/mean": 0.6354166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.46435439586639404, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.5467708706855774, "step": 6879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 496.875, "completions/min_length": 461.0, "epoch": 10.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9261417984962463, "kl": 0.012963120825588703, "learning_rate": 5.766958274393428e-07, "loss": 0.00012928247451782227, "reward": 0.7729166746139526, "reward_std": 0.24526265263557434, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 6880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 437.875, "completions/min_length": 350.0, "epoch": 10.119117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.00982409343123436, "kl": 0.008031970239244401, "learning_rate": 5.765690106421014e-07, "loss": 7.994195038918406e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 382.25, "completions/min_length": 338.0, "epoch": 10.120588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0585829019546509, "kl": 0.01307349163107574, "learning_rate": 5.764421888006874e-07, "loss": 0.00012917816638946533, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 491.8125, "completions/min_length": 416.0, "epoch": 10.12205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8030232191085815, "kl": 0.00865525845438242, "learning_rate": 5.763153619234556e-07, "loss": 8.672413241583854e-05, "reward": 0.9239583015441895, "reward_std": 0.1609395444393158, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 6883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 505.875, "completions/min_length": 412.0, "epoch": 10.123529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.241706371307373, "kl": 0.008487029001116753, "learning_rate": 5.761885300187608e-07, "loss": 8.461624383926392e-05, "reward": 0.7437499761581421, "reward_std": 0.36251845955848694, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 460.875, "completions/min_length": 401.0, "epoch": 10.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.863830029964447, "kl": 0.008239720133133233, "learning_rate": 5.760616930949584e-07, "loss": 8.228339720517397e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 436.5, "completions/min_length": 406.0, "epoch": 10.126470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.022558720782399178, "kl": 0.01444349205121398, "learning_rate": 5.759348511604042e-07, "loss": 0.00014400589861907065, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 429.6875, "completions/min_length": 356.0, "epoch": 10.12794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012735041789710522, "kl": 0.008601332549005747, "learning_rate": 5.758080042234542e-07, "loss": 8.528225589543581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 453.1875, "completions/min_length": 390.0, "epoch": 10.129411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.3516727685928345, "kl": 0.00955786497797817, "learning_rate": 5.756811522924645e-07, "loss": 9.399652481079102e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 6888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 500.75, "completions/min_length": 458.0, "epoch": 10.130882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.027352698147296906, "kl": 0.013897576602175832, "learning_rate": 5.755542953757923e-07, "loss": 0.00013985950499773026, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 410.3125, "completions/min_length": 367.0, "epoch": 10.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.009999844245612621, "kl": 0.008134780335240066, "learning_rate": 5.754274334817943e-07, "loss": 8.114735828712583e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 440.4375, "completions/min_length": 369.0, "epoch": 10.133823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.6924300789833069, "kl": 0.007245665765367448, "learning_rate": 5.753005666188277e-07, "loss": 7.218867540359497e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 425.0, "completions/min_length": 360.0, "epoch": 10.135294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.012652146629989147, "kl": 0.01027257600799203, "learning_rate": 5.751736947952503e-07, "loss": 0.00010176428622798994, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 416.75, "completions/min_length": 367.0, "epoch": 10.136764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.2274271249771118, "kl": 0.012578781228512526, "learning_rate": 5.750468180194204e-07, "loss": 0.00012504214828368276, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/mean_length": 564.8125, "completions/min_length": 409.0, "epoch": 10.138235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3497953414916992, "kl": 0.013294950826093554, "learning_rate": 5.749199362996958e-07, "loss": 0.000135030597448349, "reward": 0.5904732346534729, "reward_std": 0.37629058957099915, "rewards/DrugCombAccuracyCOTORM/mean": 0.5101835131645203, "rewards/DrugCombAccuracyCOTORM/std": 0.4686484932899475, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8232638835906982, "rewards/DrugCombCoverageCOTORM/std": 0.2982129156589508, "step": 6894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 442.0625, "completions/min_length": 409.0, "epoch": 10.139705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9124181866645813, "kl": 0.010154447285458446, "learning_rate": 5.747930496444356e-07, "loss": 0.00010204315185546875, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 480.5, "completions/min_length": 398.0, "epoch": 10.141176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014515269547700882, "kl": 0.010663866996765137, "learning_rate": 5.746661580619985e-07, "loss": 0.00010636939259711653, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 437.3125, "completions/min_length": 386.0, "epoch": 10.14264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8753309845924377, "kl": 0.010948054725304246, "learning_rate": 5.745392615607437e-07, "loss": 0.00010974874021485448, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 6897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 462.3125, "completions/min_length": 398.0, "epoch": 10.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014278250746428967, "kl": 0.007670931285247207, "learning_rate": 5.744123601490311e-07, "loss": 7.672391075175256e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 455.5, "completions/min_length": 402.0, "epoch": 10.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1806946992874146, "kl": 0.008747089537791908, "learning_rate": 5.742854538352206e-07, "loss": 8.71419906616211e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 462.1875, "completions/min_length": 397.0, "epoch": 10.147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8566210865974426, "kl": 0.009197724401019514, "learning_rate": 5.741585426276723e-07, "loss": 9.211084397975355e-05, "reward": 0.32533329725265503, "reward_std": 0.03209509700536728, "rewards/DrugCombAccuracyCOTORM/mean": 0.20428571105003357, "rewards/DrugCombAccuracyCOTORM/std": 0.04293960705399513, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6190476417541504, "rewards/DrugCombCoverageCOTORM/std": 0.13012002408504486, "step": 6900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 434.8125, "completions/min_length": 394.0, "epoch": 10.148529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011148521676659584, "kl": 0.008887871983461082, "learning_rate": 5.740316265347469e-07, "loss": 8.847159915603697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 456.875, "completions/min_length": 427.0, "epoch": 10.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.010513534769415855, "kl": 0.00837901677004993, "learning_rate": 5.739047055648054e-07, "loss": 8.369727584067732e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 455.8125, "completions/min_length": 417.0, "epoch": 10.151470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.03366704657673836, "kl": 0.010279483976773918, "learning_rate": 5.737777797262088e-07, "loss": 0.00010319744615117088, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 464.0625, "completions/min_length": 398.0, "epoch": 10.152941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.00983559899032116, "kl": 0.0087992656044662, "learning_rate": 5.736508490273187e-07, "loss": 8.748270920477808e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 458.875, "completions/min_length": 398.0, "epoch": 10.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010010511614382267, "kl": 0.009168271208181977, "learning_rate": 5.735239134764972e-07, "loss": 9.167879761662334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 438.9375, "completions/min_length": 381.0, "epoch": 10.155882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.018888073042035103, "kl": 0.0101775435032323, "learning_rate": 5.733969730821063e-07, "loss": 0.00010205258877249435, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 417.75, "completions/min_length": 391.0, "epoch": 10.157352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.010263979434967041, "kl": 0.0072879677172750235, "learning_rate": 5.732700278525086e-07, "loss": 7.28223385522142e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 442.5, "completions/min_length": 351.0, "epoch": 10.158823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 1.3991724252700806, "kl": 0.02308673528023064, "learning_rate": 5.731430777960668e-07, "loss": 0.00023376010358333588, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 441.875, "completions/min_length": 392.0, "epoch": 10.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.015740985050797462, "kl": 0.009233086369931698, "learning_rate": 5.730161229211443e-07, "loss": 9.26206266740337e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 448.75, "completions/min_length": 393.0, "epoch": 10.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7378716468811035, "kl": 0.006805967539548874, "learning_rate": 5.728891632361042e-07, "loss": 6.755038339179009e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 580.5, "completions/min_length": 512.0, "epoch": 10.163235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.1311697959899902, "kl": 0.011643956415355206, "learning_rate": 5.727621987493107e-07, "loss": 0.00011770427227020264, "reward": 0.9681249856948853, "reward_std": 0.09015608578920364, "rewards/DrugCombAccuracyCOTORM/mean": 0.9666666984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.0942808985710144, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 6911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 509.3125, "completions/min_length": 438.0, "epoch": 10.16470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1611262559890747, "kl": 0.011981098214164376, "learning_rate": 5.726352294691274e-07, "loss": 0.0001202434505103156, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 440.9375, "completions/min_length": 397.0, "epoch": 10.166176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.024193590506911278, "kl": 0.010504517704248428, "learning_rate": 5.725082554039192e-07, "loss": 0.00010390386887593195, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 467.6875, "completions/min_length": 400.0, "epoch": 10.16764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0561754703521729, "kl": 0.014553513610735536, "learning_rate": 5.723812765620507e-07, "loss": 0.00014490692410618067, "reward": 0.7854166626930237, "reward_std": 0.23153002560138702, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 6914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 474.125, "completions/min_length": 374.0, "epoch": 10.169117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.020824231207370758, "kl": 0.012084641959518194, "learning_rate": 5.722542929518867e-07, "loss": 0.00012148790119681507, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 502.5625, "completions/min_length": 402.0, "epoch": 10.170588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8095874786376953, "kl": 0.00911504216492176, "learning_rate": 5.721273045817929e-07, "loss": 9.116530418395996e-05, "reward": 0.9177083373069763, "reward_std": 0.1607545167207718, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 6916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 429.875, "completions/min_length": 400.0, "epoch": 10.172058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.03045889548957348, "kl": 0.013595255790278316, "learning_rate": 5.720003114601346e-07, "loss": 0.00013812299584969878, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 438.75, "completions/min_length": 367.0, "epoch": 10.173529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.04908022657036781, "kl": 0.01335983257740736, "learning_rate": 5.718733135952781e-07, "loss": 0.0001336819404968992, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 502.125, "completions/min_length": 431.0, "epoch": 10.175, "frac_reward_zero_std": 0.0, "grad_norm": 1.2112882137298584, "kl": 0.012523032259196043, "learning_rate": 5.717463109955895e-07, "loss": 0.00012662261724472046, "reward": 0.6484375, "reward_std": 0.32759973406791687, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 6919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 433.75, "completions/min_length": 389.0, "epoch": 10.176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.010792500339448452, "kl": 0.0085187500808388, "learning_rate": 5.716193036694358e-07, "loss": 8.45839167595841e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 6920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 455.0, "completions/min_length": 404.0, "epoch": 10.177941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01331528089940548, "kl": 0.010471244342625141, "learning_rate": 5.714922916251834e-07, "loss": 0.00010506022226763889, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 541.75, "completions/min_length": 478.0, "epoch": 10.179411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013862616382539272, "kl": 0.009099711896851659, "learning_rate": 5.713652748711997e-07, "loss": 9.050759399542585e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 452.625, "completions/min_length": 416.0, "epoch": 10.180882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3664252758026123, "kl": 0.014627793105319142, "learning_rate": 5.712382534158524e-07, "loss": 0.00014591217041015625, "reward": 0.8302261829376221, "reward_std": 0.25444793701171875, "rewards/DrugCombAccuracyCOTORM/mean": 0.7955952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.40038785338401794, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 499.875, "completions/min_length": 413.0, "epoch": 10.18235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9039769172668457, "kl": 0.013873161748051643, "learning_rate": 5.711112272675092e-07, "loss": 0.00013807043433189392, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 431.0625, "completions/min_length": 383.0, "epoch": 10.183823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0060193538665771, "kl": 0.011374585912562907, "learning_rate": 5.709841964345382e-07, "loss": 0.0001138048100983724, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 467.75, "completions/min_length": 408.0, "epoch": 10.185294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014101332053542137, "kl": 0.010210619657300413, "learning_rate": 5.708571609253083e-07, "loss": 0.00010160275996895507, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 416.5625, "completions/min_length": 390.0, "epoch": 10.186764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011225545778870583, "kl": 0.00853115797508508, "learning_rate": 5.707301207481876e-07, "loss": 8.542728028260171e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 461.125, "completions/min_length": 392.0, "epoch": 10.188235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0570988655090332, "kl": 0.013604442356154323, "learning_rate": 5.706030759115456e-07, "loss": 0.00013475865125656128, "reward": 0.8216458559036255, "reward_std": 0.13379503786563873, "rewards/DrugCombAccuracyCOTORM/mean": 0.8089583516120911, "rewards/DrugCombAccuracyCOTORM/std": 0.27370360493659973, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.5304391980171204, "step": 6928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 451.75, "completions/min_length": 387.0, "epoch": 10.189705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9639624953269958, "kl": 0.011970706284046173, "learning_rate": 5.704760264237518e-07, "loss": 0.00011916458606719971, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 512.8125, "completions/min_length": 442.0, "epoch": 10.191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9870063066482544, "kl": 0.012640130938962102, "learning_rate": 5.703489722931757e-07, "loss": 0.0001248608314199373, "reward": 0.7822916507720947, "reward_std": 0.1684686839580536, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3381595313549042, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.539096474647522, "step": 6930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 490.25, "completions/min_length": 410.0, "epoch": 10.19264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.230813980102539, "kl": 0.013835750985890627, "learning_rate": 5.702219135281871e-07, "loss": 0.00013675913214683533, "reward": 0.8999583721160889, "reward_std": 0.08503270149230957, "rewards/DrugCombAccuracyCOTORM/mean": 0.8801562786102295, "rewards/DrugCombAccuracyCOTORM/std": 0.18698236346244812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 6931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 514.25, "completions/min_length": 465.0, "epoch": 10.194117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.6932325959205627, "kl": 0.009495703503489494, "learning_rate": 5.70094850137157e-07, "loss": 9.569525718688965e-05, "reward": 0.9552500247955322, "reward_std": 0.1265721172094345, "rewards/DrugCombAccuracyCOTORM/mean": 0.9466666579246521, "rewards/DrugCombAccuracyCOTORM/std": 0.2133333384990692, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 463.8125, "completions/min_length": 366.0, "epoch": 10.195588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012404325418174267, "kl": 0.009141274262219667, "learning_rate": 5.699677821284552e-07, "loss": 9.138803579844534e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 495.75, "completions/min_length": 473.0, "epoch": 10.197058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.024772047996520996, "kl": 0.01206703344359994, "learning_rate": 5.698407095104531e-07, "loss": 0.00011989485210506245, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 506.5625, "completions/min_length": 415.0, "epoch": 10.198529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.1275421380996704, "kl": 0.012251197593286633, "learning_rate": 5.697136322915217e-07, "loss": 0.00012230873107910156, "reward": 0.8035833835601807, "reward_std": 0.13181321322917938, "rewards/DrugCombAccuracyCOTORM/mean": 0.7645238041877747, "rewards/DrugCombAccuracyCOTORM/std": 0.31963616609573364, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9196428656578064, "rewards/DrugCombCoverageCOTORM/std": 0.1727626621723175, "step": 6935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 416.375, "completions/min_length": 362.0, "epoch": 10.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.01889878138899803, "kl": 0.011236827122047544, "learning_rate": 5.695865504800327e-07, "loss": 0.00011244518827879801, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 505.0625, "completions/min_length": 399.0, "epoch": 10.201470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2058771848678589, "kl": 0.010457086958922446, "learning_rate": 5.694594640843578e-07, "loss": 0.00010500848293304443, "reward": 0.8716250061988831, "reward_std": 0.2865436375141144, "rewards/DrugCombAccuracyCOTORM/mean": 0.8434374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.34061938524246216, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 6937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 517.375, "completions/min_length": 443.0, "epoch": 10.202941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.508600115776062, "kl": 0.01449685962870717, "learning_rate": 5.693323731128694e-07, "loss": 0.00014474987983703613, "reward": 0.6625000238418579, "reward_std": 0.4106191098690033, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.4654746949672699, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.4654746949672699, "step": 6938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 500.75, "completions/min_length": 441.0, "epoch": 10.204411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9526848196983337, "kl": 0.010183505364693701, "learning_rate": 5.692052775739395e-07, "loss": 0.00010315497638657689, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.3125, "completions/min_length": 402.0, "epoch": 10.205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012635541148483753, "kl": 0.008563579292967916, "learning_rate": 5.690781774759412e-07, "loss": 8.552390499971807e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 426.8125, "completions/min_length": 386.0, "epoch": 10.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.029472507536411285, "kl": 0.011740915244445205, "learning_rate": 5.689510728272472e-07, "loss": 0.0001176019140984863, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 461.25, "completions/min_length": 393.0, "epoch": 10.208823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9205121397972107, "kl": 0.01390114426612854, "learning_rate": 5.688239636362311e-07, "loss": 0.00014106929302215576, "reward": 0.7250000238418579, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 458.6875, "completions/min_length": 417.0, "epoch": 10.21029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.036692775785923004, "kl": 0.01318729342892766, "learning_rate": 5.686968499112665e-07, "loss": 0.00013289724302012473, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 516.375, "completions/min_length": 466.0, "epoch": 10.211764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9484717845916748, "kl": 0.010400533326901495, "learning_rate": 5.685697316607274e-07, "loss": 0.00010430224938318133, "reward": 0.5056999921798706, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.39149999618530273, "rewards/DrugCombAccuracyCOTORM/std": 0.48870521783828735, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 6944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 472.4375, "completions/min_length": 412.0, "epoch": 10.213235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1781120300292969, "kl": 0.012746450491249561, "learning_rate": 5.684426088929876e-07, "loss": 0.00012614578008651733, "reward": 0.9026666879653931, "reward_std": 0.2753002643585205, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 6945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 450.9375, "completions/min_length": 404.0, "epoch": 10.214705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.4344412088394165, "kl": 0.014144502114504576, "learning_rate": 5.68315481616422e-07, "loss": 0.00014200061559677124, "reward": 0.78125, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 6946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 462.25, "completions/min_length": 398.0, "epoch": 10.216176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.4177350997924805, "kl": 0.009525900823064148, "learning_rate": 5.681883498394054e-07, "loss": 9.441615839023143e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 432.0, "completions/min_length": 380.0, "epoch": 10.217647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.010300179943442345, "kl": 0.008938069455325603, "learning_rate": 5.680612135703129e-07, "loss": 8.856621570885181e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 486.6875, "completions/min_length": 437.0, "epoch": 10.219117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.009785699658095837, "kl": 0.008800859795883298, "learning_rate": 5.679340728175199e-07, "loss": 8.817090565571561e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 452.5625, "completions/min_length": 410.0, "epoch": 10.220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.016009362414479256, "kl": 0.00952709885314107, "learning_rate": 5.678069275894022e-07, "loss": 9.497721475781873e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 420.375, "completions/min_length": 357.0, "epoch": 10.222058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016869112849235535, "kl": 0.011191272409632802, "learning_rate": 5.676797778943356e-07, "loss": 0.00011202989117009565, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 433.875, "completions/min_length": 345.0, "epoch": 10.223529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.201936960220337, "kl": 0.00965597270987928, "learning_rate": 5.675526237406965e-07, "loss": 9.553134441375732e-05, "reward": 0.8589166402816772, "reward_std": 0.3013652563095093, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 6952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 465.3125, "completions/min_length": 411.0, "epoch": 10.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.029400233179330826, "kl": 0.012171959038823843, "learning_rate": 5.674254651368615e-07, "loss": 0.00012222578516229987, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 442.1875, "completions/min_length": 408.0, "epoch": 10.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.17704002559185028, "kl": 0.013794734608381987, "learning_rate": 5.672983020912076e-07, "loss": 0.0001382813643431291, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 438.125, "completions/min_length": 371.0, "epoch": 10.227941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.3347421884536743, "kl": 0.01367302774451673, "learning_rate": 5.671711346121116e-07, "loss": 0.00013568901340477169, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 6955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 450.3125, "completions/min_length": 359.0, "epoch": 10.229411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1047590970993042, "kl": 0.0113736093044281, "learning_rate": 5.670439627079517e-07, "loss": 0.0001133415789809078, "reward": 0.5214166641235352, "reward_std": 0.06057547777891159, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.10416668653488159, "rewards/DrugCombCoverageCOTORM/std": 1.0089874267578125, "step": 6956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 455.3125, "completions/min_length": 398.0, "epoch": 10.230882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9201030731201172, "kl": 0.009048296138644218, "learning_rate": 5.669167863871048e-07, "loss": 8.892086043488234e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 449.125, "completions/min_length": 413.0, "epoch": 10.23235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008215007372200489, "kl": 0.007646727841347456, "learning_rate": 5.667896056579495e-07, "loss": 7.639882824150845e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 416.75, "completions/min_length": 349.0, "epoch": 10.233823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.009233375079929829, "kl": 0.008390551898628473, "learning_rate": 5.666624205288639e-07, "loss": 8.425813575740904e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 430.5625, "completions/min_length": 367.0, "epoch": 10.235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0157015360891819, "kl": 0.008544362965039909, "learning_rate": 5.665352310082269e-07, "loss": 8.554144005756825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 426.1875, "completions/min_length": 392.0, "epoch": 10.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.032725557684898376, "kl": 0.010174194816499949, "learning_rate": 5.664080371044171e-07, "loss": 0.00010087768168887123, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 502.4375, "completions/min_length": 426.0, "epoch": 10.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8723820447921753, "kl": 0.011077649425715208, "learning_rate": 5.66280838825814e-07, "loss": 0.00010980062506860122, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 6962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 458.0, "completions/min_length": 392.0, "epoch": 10.239705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1608929634094238, "kl": 0.009786818409338593, "learning_rate": 5.661536361807969e-07, "loss": 9.796768426895142e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 410.3125, "completions/min_length": 342.0, "epoch": 10.241176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.01726718060672283, "kl": 0.008971988805569708, "learning_rate": 5.660264291777456e-07, "loss": 8.941326814237982e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 480.375, "completions/min_length": 396.0, "epoch": 10.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8541918396949768, "kl": 0.009040493401698768, "learning_rate": 5.658992178250404e-07, "loss": 8.964553126133978e-05, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 6965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 458.0625, "completions/min_length": 423.0, "epoch": 10.244117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009785745292901993, "kl": 0.007774363388307393, "learning_rate": 5.657720021310614e-07, "loss": 7.752063538646325e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 469.8125, "completions/min_length": 390.0, "epoch": 10.245588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013115421868860722, "kl": 0.009568577515892684, "learning_rate": 5.656447821041895e-07, "loss": 9.550625691190362e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 447.5625, "completions/min_length": 375.0, "epoch": 10.24705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02347397990524769, "kl": 0.012331161182373762, "learning_rate": 5.655175577528053e-07, "loss": 0.000122969169751741, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.375, "completions/min_length": 395.0, "epoch": 10.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8327504396438599, "kl": 0.013199809705838561, "learning_rate": 5.653903290852903e-07, "loss": 0.00013242884597275406, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 6969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 457.1875, "completions/min_length": 426.0, "epoch": 10.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.011862804181873798, "kl": 0.009055532398633659, "learning_rate": 5.652630961100258e-07, "loss": 9.04053813428618e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 429.625, "completions/min_length": 386.0, "epoch": 10.251470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.03893950581550598, "kl": 0.010183780221268535, "learning_rate": 5.651358588353937e-07, "loss": 0.00010006918455474079, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 496.6875, "completions/min_length": 417.0, "epoch": 10.25294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7237417101860046, "kl": 0.013307381421327591, "learning_rate": 5.650086172697761e-07, "loss": 0.00013303756713867188, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 433.4375, "completions/min_length": 389.0, "epoch": 10.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01383836567401886, "kl": 0.008763477206230164, "learning_rate": 5.648813714215553e-07, "loss": 8.767560211708769e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 438.4375, "completions/min_length": 365.0, "epoch": 10.255882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.022293061017990112, "kl": 0.009846089989878237, "learning_rate": 5.647541212991141e-07, "loss": 9.926507482305169e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 513.25, "completions/min_length": 432.0, "epoch": 10.257352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.2015513181686401, "kl": 0.011509134201332927, "learning_rate": 5.646268669108352e-07, "loss": 0.00011554360389709473, "reward": 0.5515416860580444, "reward_std": 0.2058400809764862, "rewards/DrugCombAccuracyCOTORM/mean": 0.49281251430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.4788735806941986, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5729166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4790761172771454, "step": 6975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 429.375, "completions/min_length": 309.0, "epoch": 10.258823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.216085433959961, "kl": 0.010213068220764399, "learning_rate": 5.644996082651016e-07, "loss": 0.00010272115468978882, "reward": 0.8653750419616699, "reward_std": 0.30422133207321167, "rewards/DrugCombAccuracyCOTORM/mean": 0.8434374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.34061938524246216, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2561737895011902, "step": 6976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 554.5, "completions/min_length": 478.0, "epoch": 10.260294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.7250783443450928, "kl": 0.008238596725277603, "learning_rate": 5.643723453702973e-07, "loss": 8.182880992535502e-05, "reward": 0.9286999702453613, "reward_std": 0.145585834980011, "rewards/DrugCombAccuracyCOTORM/mean": 0.9124374985694885, "rewards/DrugCombAccuracyCOTORM/std": 0.2630749046802521, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 6977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 487.8125, "completions/min_length": 402.0, "epoch": 10.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020969785749912262, "kl": 0.01079188333824277, "learning_rate": 5.642450782348058e-07, "loss": 0.00010832446423592046, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 528.625, "completions/min_length": 436.0, "epoch": 10.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2534105777740479, "kl": 0.009410412632860243, "learning_rate": 5.641178068670109e-07, "loss": 9.50060784816742e-05, "reward": 0.9125000238418579, "reward_std": 0.18077215552330017, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 470.0625, "completions/min_length": 368.0, "epoch": 10.264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.2445474863052368, "kl": 0.011788364499807358, "learning_rate": 5.639905312752974e-07, "loss": 0.00011862069368362427, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 480.8125, "completions/min_length": 396.0, "epoch": 10.266176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0552033185958862, "kl": 0.02111234050244093, "learning_rate": 5.638632514680495e-07, "loss": 0.00020863433019258082, "reward": 0.5958333015441895, "reward_std": 0.16660714149475098, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7781745195388794, "step": 6981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 438.25, "completions/min_length": 367.0, "epoch": 10.26764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01067983265966177, "kl": 0.007094659493304789, "learning_rate": 5.637359674536521e-07, "loss": 7.053190347505733e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 494.625, "completions/min_length": 402.0, "epoch": 10.269117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9755628108978271, "kl": 0.01189354551024735, "learning_rate": 5.636086792404907e-07, "loss": 0.00011802779044955969, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 435.0, "completions/min_length": 391.0, "epoch": 10.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02877984009683132, "kl": 0.010354598169215024, "learning_rate": 5.634813868369503e-07, "loss": 0.00010349720105296001, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 468.125, "completions/min_length": 384.0, "epoch": 10.272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.021981528028845787, "kl": 0.011001881794072688, "learning_rate": 5.633540902514169e-07, "loss": 0.00010986840061377734, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 518.0625, "completions/min_length": 441.0, "epoch": 10.273529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010461026802659035, "kl": 0.00992376497015357, "learning_rate": 5.632267894922764e-07, "loss": 9.974618296837434e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 490.6875, "completions/min_length": 412.0, "epoch": 10.275, "frac_reward_zero_std": 0.0, "grad_norm": 1.7126013040542603, "kl": 0.01589795621111989, "learning_rate": 5.630994845679149e-07, "loss": 0.00015878677368164062, "reward": 0.5625, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 6987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 443.1875, "completions/min_length": 386.0, "epoch": 10.276470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9923280477523804, "kl": 0.015942707657814026, "learning_rate": 5.629721754867191e-07, "loss": 0.00015708396676927805, "reward": 0.8356666564941406, "reward_std": 0.17567972838878632, "rewards/DrugCombAccuracyCOTORM/mean": 0.8050000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.3488266170024872, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 6988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 419.375, "completions/min_length": 386.0, "epoch": 10.277941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.1846608817577362, "kl": 0.013974484289065003, "learning_rate": 5.628448622570757e-07, "loss": 0.0001390443358104676, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 413.875, "completions/min_length": 366.0, "epoch": 10.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017457112669944763, "kl": 0.008231194922700524, "learning_rate": 5.627175448873719e-07, "loss": 8.191496453946456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 425.3125, "completions/min_length": 372.0, "epoch": 10.280882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9601131081581116, "kl": 0.01061708212364465, "learning_rate": 5.625902233859948e-07, "loss": 0.00010739538993220776, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 497.3125, "completions/min_length": 388.0, "epoch": 10.282352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.821470320224762, "kl": 0.01670328015461564, "learning_rate": 5.624628977613324e-07, "loss": 0.00016785546904429793, "reward": 0.800000011920929, "reward_std": 0.12344269454479218, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3333333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 459.625, "completions/min_length": 385.0, "epoch": 10.283823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3316962718963623, "kl": 0.01383323222398758, "learning_rate": 5.623355680217723e-07, "loss": 0.0001392662525177002, "reward": 0.42500001192092896, "reward_std": 0.37192288041114807, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 6993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 498.5625, "completions/min_length": 450.0, "epoch": 10.285294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.237167239189148, "kl": 0.011728896060958505, "learning_rate": 5.622082341757026e-07, "loss": 0.00011813640594482422, "reward": 0.5517083406448364, "reward_std": 0.30608493089675903, "rewards/DrugCombAccuracyCOTORM/mean": 0.4591667056083679, "rewards/DrugCombAccuracyCOTORM/std": 0.38443607091903687, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.1663190871477127, "step": 6994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 420.0625, "completions/min_length": 375.0, "epoch": 10.286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017951270565390587, "kl": 0.008917519240640104, "learning_rate": 5.62080896231512e-07, "loss": 8.896010695025325e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 444.8125, "completions/min_length": 354.0, "epoch": 10.288235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.010825871489942074, "kl": 0.009150200756266713, "learning_rate": 5.619535541975889e-07, "loss": 9.170779958367348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 441.3125, "completions/min_length": 392.0, "epoch": 10.28970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009009579196572304, "kl": 0.008559504174627364, "learning_rate": 5.618262080823227e-07, "loss": 8.541916031390429e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 432.5, "completions/min_length": 376.0, "epoch": 10.291176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.020285483449697495, "kl": 0.00875962374266237, "learning_rate": 5.616988578941022e-07, "loss": 8.829178841551766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 6998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 458.9375, "completions/min_length": 357.0, "epoch": 10.29264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0334053039550781, "kl": 0.012690517585724592, "learning_rate": 5.615715036413171e-07, "loss": 0.00012506748316809535, "reward": 0.8805000185966492, "reward_std": 0.07378603518009186, "rewards/DrugCombAccuracyCOTORM/mean": 0.8714583516120911, "rewards/DrugCombAccuracyCOTORM/std": 0.17190854251384735, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2357022613286972, "step": 6999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 442.1875, "completions/min_length": 400.0, "epoch": 10.294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013023571111261845, "kl": 0.008996410295367241, "learning_rate": 5.614441453323571e-07, "loss": 8.950526535045356e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 387.3125, "completions/min_length": 332.0, "epoch": 10.295588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013856912963092327, "kl": 0.011661443626508117, "learning_rate": 5.613167829756124e-07, "loss": 0.00011747189273592085, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 493.875, "completions/min_length": 440.0, "epoch": 10.297058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3651578426361084, "kl": 0.013055386487394571, "learning_rate": 5.611894165794731e-07, "loss": 0.00013193488121032715, "reward": 0.6802083253860474, "reward_std": 0.3514903485774994, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 447.3125, "completions/min_length": 397.0, "epoch": 10.298529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2258647680282593, "kl": 0.008224036428146064, "learning_rate": 5.6106204615233e-07, "loss": 8.213641558540985e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 501.25, "completions/min_length": 399.0, "epoch": 10.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.8657433390617371, "kl": 0.007977374945767224, "learning_rate": 5.609346717025737e-07, "loss": 7.994934276212007e-05, "reward": 0.8379166722297668, "reward_std": 0.22012217342853546, "rewards/DrugCombAccuracyCOTORM/mean": 0.831250011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.34970226883888245, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 7004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 448.25, "completions/min_length": 414.0, "epoch": 10.301470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.010330203920602798, "kl": 0.010409969137981534, "learning_rate": 5.608072932385955e-07, "loss": 0.00010405783541500568, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 457.75, "completions/min_length": 383.0, "epoch": 10.302941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.022112764418125153, "kl": 0.009180495282635093, "learning_rate": 5.606799107687867e-07, "loss": 9.269791189581156e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 485.625, "completions/min_length": 406.0, "epoch": 10.304411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8029939532279968, "kl": 0.0115428336430341, "learning_rate": 5.60552524301539e-07, "loss": 0.00011607960186665878, "reward": 0.7512999773025513, "reward_std": 0.15362104773521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.7047500014305115, "rewards/DrugCombAccuracyCOTORM/std": 0.39380326867103577, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 7007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 403.4375, "completions/min_length": 331.0, "epoch": 10.305882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016776280477643013, "kl": 0.012758864322677255, "learning_rate": 5.604251338452443e-07, "loss": 0.00012800034892279655, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 430.5, "completions/min_length": 378.0, "epoch": 10.30735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017721429467201233, "kl": 0.01063840277493, "learning_rate": 5.602977394082946e-07, "loss": 0.00010713985102484003, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 490.4375, "completions/min_length": 440.0, "epoch": 10.308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7958807349205017, "kl": 0.010602634167298675, "learning_rate": 5.601703409990824e-07, "loss": 0.000105246901512146, "reward": 0.644058346748352, "reward_std": 0.008650270290672779, "rewards/DrugCombAccuracyCOTORM/mean": 0.5850208401679993, "rewards/DrugCombAccuracyCOTORM/std": 0.4286993443965912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 7010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 480.8125, "completions/min_length": 409.0, "epoch": 10.310294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3724805116653442, "kl": 0.012862324481830001, "learning_rate": 5.600429386260004e-07, "loss": 0.00012893974781036377, "reward": 0.581250011920929, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 479.6875, "completions/min_length": 444.0, "epoch": 10.311764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.00821685791015625, "kl": 0.007459612796083093, "learning_rate": 5.599155322974416e-07, "loss": 7.488244591513649e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 431.9375, "completions/min_length": 367.0, "epoch": 10.313235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.01951202005147934, "kl": 0.007376260124146938, "learning_rate": 5.597881220217991e-07, "loss": 7.318289135582745e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 455.0625, "completions/min_length": 393.0, "epoch": 10.314705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7374808192253113, "kl": 0.010204006917774677, "learning_rate": 5.596607078074665e-07, "loss": 0.0001025486271828413, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 475.75, "completions/min_length": 444.0, "epoch": 10.316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9874827861785889, "kl": 0.01214581960812211, "learning_rate": 5.595332896628374e-07, "loss": 0.0001214146614074707, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 541.1875, "completions/min_length": 442.0, "epoch": 10.31764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2183829545974731, "kl": 0.011576090939342976, "learning_rate": 5.594058675963059e-07, "loss": 0.00011584162712097168, "reward": 0.48750001192092896, "reward_std": 0.357613742351532, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 452.25, "completions/min_length": 380.0, "epoch": 10.319117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.03216884657740593, "kl": 0.009276500204578042, "learning_rate": 5.592784416162662e-07, "loss": 9.155960287898779e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 466.9375, "completions/min_length": 415.0, "epoch": 10.320588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9729260206222534, "kl": 0.00980912521481514, "learning_rate": 5.591510117311127e-07, "loss": 9.801783744478598e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 410.8125, "completions/min_length": 370.0, "epoch": 10.322058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.018277129158377647, "kl": 0.00964182906318456, "learning_rate": 5.590235779492401e-07, "loss": 9.645840327721089e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 459.3125, "completions/min_length": 380.0, "epoch": 10.323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7781681418418884, "kl": 0.009950088569894433, "learning_rate": 5.588961402790438e-07, "loss": 9.916516864905134e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 419.125, "completions/min_length": 345.0, "epoch": 10.325, "frac_reward_zero_std": 0.0, "grad_norm": 1.2341594696044922, "kl": 0.011772522469982505, "learning_rate": 5.587686987289189e-07, "loss": 0.00011791661381721497, "reward": 0.6499999761581421, "reward_std": 0.3921346068382263, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 491.9375, "completions/min_length": 402.0, "epoch": 10.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017392367124557495, "kl": 0.01126985577866435, "learning_rate": 5.586412533072606e-07, "loss": 0.0001130572272813879, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 588.625, "completions/min_length": 529.0, "epoch": 10.327941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.9954904913902283, "kl": 0.009084193501621485, "learning_rate": 5.58513804022465e-07, "loss": 9.085237979888916e-05, "reward": 0.6767416596412659, "reward_std": 0.3492279052734375, "rewards/DrugCombAccuracyCOTORM/mean": 0.6066041588783264, "rewards/DrugCombAccuracyCOTORM/std": 0.44814789295196533, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9145833253860474, "rewards/DrugCombCoverageCOTORM/std": 0.1623753011226654, "step": 7023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 498.25, "completions/min_length": 407.0, "epoch": 10.329411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4631482362747192, "kl": 0.012552216649055481, "learning_rate": 5.58386350882928e-07, "loss": 0.00012601539492607117, "reward": 0.6500000357627869, "reward_std": 0.4208287000656128, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 473.0, "completions/min_length": 392.0, "epoch": 10.330882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9096019268035889, "kl": 0.011835884302854538, "learning_rate": 5.582588938970462e-07, "loss": 0.00011832072050310671, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 422.875, "completions/min_length": 331.0, "epoch": 10.33235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015230175107717514, "kl": 0.010407131863757968, "learning_rate": 5.581314330732158e-07, "loss": 0.00010334471880923957, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 476.9375, "completions/min_length": 375.0, "epoch": 10.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0277541875839233, "kl": 0.010482576442882419, "learning_rate": 5.580039684198337e-07, "loss": 0.00010359591396991163, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 448.4375, "completions/min_length": 386.0, "epoch": 10.33529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013763997703790665, "kl": 0.009084461024031043, "learning_rate": 5.578764999452969e-07, "loss": 9.075046546058729e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 452.9375, "completions/min_length": 409.0, "epoch": 10.336764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.012232293374836445, "kl": 0.00677027681376785, "learning_rate": 5.577490276580027e-07, "loss": 6.790866609662771e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 409.3125, "completions/min_length": 340.0, "epoch": 10.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010480453260242939, "kl": 0.00919846713077277, "learning_rate": 5.576215515663488e-07, "loss": 9.179602056974545e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 487.0, "completions/min_length": 419.0, "epoch": 10.339705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009103122167289257, "kl": 0.007759053143672645, "learning_rate": 5.574940716787328e-07, "loss": 7.702332368353382e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 448.25, "completions/min_length": 386.0, "epoch": 10.341176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8104240894317627, "kl": 0.011735080042853951, "learning_rate": 5.573665880035531e-07, "loss": 0.0001164320347015746, "reward": 0.7945833206176758, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3733965754508972, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.15957117080688477, "step": 7032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 464.25, "completions/min_length": 376.0, "epoch": 10.342647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.2620123624801636, "kl": 0.01125098136253655, "learning_rate": 5.572391005492075e-07, "loss": 0.00011183694005012512, "reward": 0.710812509059906, "reward_std": 0.3178586959838867, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 7033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 427.25, "completions/min_length": 378.0, "epoch": 10.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.063808798789978, "kl": 0.009748576674610376, "learning_rate": 5.571116093240949e-07, "loss": 9.699308429844677e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 422.25, "completions/min_length": 346.0, "epoch": 10.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.04457687586545944, "kl": 0.008859004476107657, "learning_rate": 5.569841143366141e-07, "loss": 8.86614725459367e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 392.0, "completions/min_length": 356.0, "epoch": 10.347058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2189126014709473, "kl": 0.011330104433000088, "learning_rate": 5.568566155951639e-07, "loss": 0.00011478827946120873, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 7036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 456.6875, "completions/min_length": 417.0, "epoch": 10.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.027740240097046, "kl": 0.012379694730043411, "learning_rate": 5.567291131081437e-07, "loss": 0.00012349573080427945, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 445.875, "completions/min_length": 331.0, "epoch": 10.35, "frac_reward_zero_std": 0.5, "grad_norm": 1.3132119178771973, "kl": 0.01326693082228303, "learning_rate": 5.566016068839534e-07, "loss": 0.00013293299707584083, "reward": 0.640791654586792, "reward_std": 0.08676879853010178, "rewards/DrugCombAccuracyCOTORM/mean": 0.5887500047683716, "rewards/DrugCombAccuracyCOTORM/std": 0.44589799642562866, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6979166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.356000155210495, "step": 7038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 474.125, "completions/min_length": 392.0, "epoch": 10.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.022204332053661346, "kl": 0.007596733164973557, "learning_rate": 5.564740969309923e-07, "loss": 7.55997170927003e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 432.3125, "completions/min_length": 367.0, "epoch": 10.352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02301577851176262, "kl": 0.011137397959828377, "learning_rate": 5.563465832576606e-07, "loss": 0.00011123434524051845, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 450.75, "completions/min_length": 406.0, "epoch": 10.354411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9315801858901978, "kl": 0.010077668586745858, "learning_rate": 5.562190658723586e-07, "loss": 0.00010180473327636719, "reward": 0.987500011920929, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 493.6875, "completions/min_length": 388.0, "epoch": 10.355882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.008414057083427906, "kl": 0.007281175698153675, "learning_rate": 5.560915447834867e-07, "loss": 7.248609472298995e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 435.375, "completions/min_length": 391.0, "epoch": 10.35735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.090611219406128, "kl": 0.00763941160403192, "learning_rate": 5.559640199994459e-07, "loss": 7.57947564125061e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 471.125, "completions/min_length": 422.0, "epoch": 10.358823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.028714962303638458, "kl": 0.01466606417670846, "learning_rate": 5.558364915286372e-07, "loss": 0.00014575013483408839, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 462.8125, "completions/min_length": 405.0, "epoch": 10.360294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010374066419899464, "kl": 0.00850767083466053, "learning_rate": 5.557089593794616e-07, "loss": 8.508226164849475e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 434.4375, "completions/min_length": 383.0, "epoch": 10.361764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010441119782626629, "kl": 0.00854458985850215, "learning_rate": 5.555814235603206e-07, "loss": 8.541422721464187e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 456.9375, "completions/min_length": 420.0, "epoch": 10.363235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2540842294692993, "kl": 0.009224487002938986, "learning_rate": 5.554538840796164e-07, "loss": 9.231269359588623e-05, "reward": 0.7562500238418579, "reward_std": 0.4178736209869385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 7047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 437.4375, "completions/min_length": 347.0, "epoch": 10.364705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.022270074114203453, "kl": 0.008066378417424858, "learning_rate": 5.553263409457503e-07, "loss": 8.162031008396298e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 463.875, "completions/min_length": 401.0, "epoch": 10.366176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 1.0296956300735474, "kl": 0.007892352412454784, "learning_rate": 5.551987941671251e-07, "loss": 7.892400026321411e-05, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 445.875, "completions/min_length": 390.0, "epoch": 10.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01808677799999714, "kl": 0.008977094548754394, "learning_rate": 5.550712437521431e-07, "loss": 8.864393748808652e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 468.3125, "completions/min_length": 304.0, "epoch": 10.369117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.5787192583084106, "kl": 0.01415926218032837, "learning_rate": 5.549436897092068e-07, "loss": 0.0001406744122505188, "reward": 0.6527832746505737, "reward_std": 0.26954519748687744, "rewards/DrugCombAccuracyCOTORM/mean": 0.5852500200271606, "rewards/DrugCombAccuracyCOTORM/std": 0.4351787269115448, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8458333015441895, "rewards/DrugCombCoverageCOTORM/std": 0.3124277889728546, "step": 7051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 448.3125, "completions/min_length": 383.0, "epoch": 10.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.015451516956090927, "kl": 0.009092719992622733, "learning_rate": 5.548161320467193e-07, "loss": 9.123415657086298e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 507.6875, "completions/min_length": 430.0, "epoch": 10.37205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9924771785736084, "kl": 0.010022270842455328, "learning_rate": 5.546885707730837e-07, "loss": 0.00010066630784422159, "reward": 0.9037333726882935, "reward_std": 0.10307221114635468, "rewards/DrugCombAccuracyCOTORM/mean": 0.8874791860580444, "rewards/DrugCombAccuracyCOTORM/std": 0.20438936352729797, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 471.0625, "completions/min_length": 413.0, "epoch": 10.373529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8782495856285095, "kl": 0.011565720778889954, "learning_rate": 5.545610058967034e-07, "loss": 0.00011447161523392424, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 492.875, "completions/min_length": 399.0, "epoch": 10.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.012409958988428116, "kl": 0.007817449164576828, "learning_rate": 5.544334374259823e-07, "loss": 7.768376963213086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 435.5625, "completions/min_length": 387.0, "epoch": 10.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.011376142501831, "kl": 0.01258015539497137, "learning_rate": 5.54305865369324e-07, "loss": 0.00012642535148188472, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 460.5, "completions/min_length": 419.0, "epoch": 10.37794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.025131138041615486, "kl": 0.009452908649109304, "learning_rate": 5.541782897351327e-07, "loss": 9.400307317264378e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 427.375, "completions/min_length": 382.0, "epoch": 10.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.014518805779516697, "kl": 0.009238717844709754, "learning_rate": 5.540507105318128e-07, "loss": 9.258993668481708e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 414.6875, "completions/min_length": 372.0, "epoch": 10.380882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010950703173875809, "kl": 0.008176350267603993, "learning_rate": 5.539231277677688e-07, "loss": 8.236082794610411e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 431.75, "completions/min_length": 388.0, "epoch": 10.382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.14915329217910767, "kl": 0.012490150751546025, "learning_rate": 5.537955414514057e-07, "loss": 0.00012184833758510649, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 493.375, "completions/min_length": 423.0, "epoch": 10.383823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8542504906654358, "kl": 0.012808728963136673, "learning_rate": 5.536679515911284e-07, "loss": 0.00012841727584600449, "reward": 0.5052083730697632, "reward_std": 0.10225021839141846, "rewards/DrugCombAccuracyCOTORM/mean": 0.3854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.43341347575187683, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 433.375, "completions/min_length": 356.0, "epoch": 10.385294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.752309262752533, "kl": 0.009131516329944134, "learning_rate": 5.535403581953424e-07, "loss": 9.188801050186157e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 449.125, "completions/min_length": 378.0, "epoch": 10.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8212125897407532, "kl": 0.010337254498153925, "learning_rate": 5.53412761272453e-07, "loss": 0.00010370100062573329, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 445.4375, "completions/min_length": 381.0, "epoch": 10.388235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0332136154174805, "kl": 0.011634867638349533, "learning_rate": 5.532851608308661e-07, "loss": 0.00011593103408813477, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 415.6875, "completions/min_length": 305.0, "epoch": 10.389705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0030113458633423, "kl": 0.00900655915029347, "learning_rate": 5.531575568789876e-07, "loss": 9.049312939168885e-05, "reward": 0.9513333439826965, "reward_std": 0.13765011727809906, "rewards/DrugCombAccuracyCOTORM/mean": 0.9443750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.2224999964237213, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 7065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 481.1875, "completions/min_length": 379.0, "epoch": 10.391176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.07150402665138245, "kl": 0.013861532090231776, "learning_rate": 5.530299494252237e-07, "loss": 0.0001379265304422006, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 452.0, "completions/min_length": 415.0, "epoch": 10.39264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1152927875518799, "kl": 0.009100038441829383, "learning_rate": 5.52902338477981e-07, "loss": 9.200962085742503e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 433.375, "completions/min_length": 377.0, "epoch": 10.394117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0896340608596802, "kl": 0.011672105174511671, "learning_rate": 5.527747240456662e-07, "loss": 0.00011619592987699434, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 490.4375, "completions/min_length": 397.0, "epoch": 10.395588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.014475616626441479, "kl": 0.010174401802942157, "learning_rate": 5.526471061366861e-07, "loss": 0.00010153664334211498, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 520.0625, "completions/min_length": 470.0, "epoch": 10.397058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.1028081178665161, "kl": 0.00973820430226624, "learning_rate": 5.525194847594479e-07, "loss": 9.682774543762207e-05, "reward": 0.8654166460037231, "reward_std": 0.1755981594324112, "rewards/DrugCombAccuracyCOTORM/mean": 0.8500000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.24765567481517792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 7070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 455.3125, "completions/min_length": 419.0, "epoch": 10.398529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00931590050458908, "kl": 0.007598979282192886, "learning_rate": 5.523918599223589e-07, "loss": 7.59049435146153e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 435.625, "completions/min_length": 372.0, "epoch": 10.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.015126579441130161, "kl": 0.010796994203701615, "learning_rate": 5.522642316338268e-07, "loss": 0.00010763109457911924, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 472.8125, "completions/min_length": 444.0, "epoch": 10.401470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.7997852563858032, "kl": 0.012186557753011584, "learning_rate": 5.521365999022592e-07, "loss": 0.00012119114398956299, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 471.625, "completions/min_length": 417.0, "epoch": 10.402941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8989962339401245, "kl": 0.02082503936253488, "learning_rate": 5.520089647360646e-07, "loss": 0.00021636812016367912, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 7074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 436.6875, "completions/min_length": 388.0, "epoch": 10.404411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0971784591674805, "kl": 0.015502044698223472, "learning_rate": 5.518813261436509e-07, "loss": 0.00015432454529218376, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 448.25, "completions/min_length": 397.0, "epoch": 10.405882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01708843931555748, "kl": 0.007870915695093572, "learning_rate": 5.517536841334267e-07, "loss": 7.828077650628984e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 412.9375, "completions/min_length": 367.0, "epoch": 10.407352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012110989540815353, "kl": 0.007961682975292206, "learning_rate": 5.516260387138009e-07, "loss": 7.96195090515539e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 389.5, "completions/min_length": 332.0, "epoch": 10.408823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.4440085887908936, "kl": 0.013052746653556824, "learning_rate": 5.514983898931822e-07, "loss": 0.00013016164302825928, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 413.125, "completions/min_length": 364.0, "epoch": 10.410294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014757928438484669, "kl": 0.007853592047467828, "learning_rate": 5.513707376799799e-07, "loss": 7.82731658546254e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 470.125, "completions/min_length": 375.0, "epoch": 10.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.281387209892273, "kl": 0.012282742653042078, "learning_rate": 5.512430820826035e-07, "loss": 0.0001211825801874511, "reward": 0.7589166760444641, "reward_std": 0.20108719170093536, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/mean_length": 533.75, "completions/min_length": 379.0, "epoch": 10.413235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.699753999710083, "kl": 0.006871403660625219, "learning_rate": 5.511154231094624e-07, "loss": 6.887653580633923e-05, "reward": 0.7977148294448853, "reward_std": 0.14123976230621338, "rewards/DrugCombAccuracyCOTORM/mean": 0.76068514585495, "rewards/DrugCombAccuracyCOTORM/std": 0.33624258637428284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8916666507720947, "rewards/DrugCombCoverageCOTORM/std": 0.19944366812705994, "step": 7081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 505.5, "completions/min_length": 446.0, "epoch": 10.41470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7406282424926758, "kl": 0.009214513702318072, "learning_rate": 5.509877607689666e-07, "loss": 9.138778841588646e-05, "reward": 0.9862916469573975, "reward_std": 0.03877301141619682, "rewards/DrugCombAccuracyCOTORM/mean": 0.98416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.06333333253860474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 7082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 446.125, "completions/min_length": 381.0, "epoch": 10.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7819840312004089, "kl": 0.009017574600875378, "learning_rate": 5.508600950695262e-07, "loss": 9.047480125445873e-05, "reward": 0.8561667203903198, "reward_std": 0.028292693197727203, "rewards/DrugCombAccuracyCOTORM/mean": 0.8306249976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.18148040771484375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 7083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 442.6875, "completions/min_length": 391.0, "epoch": 10.41764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7627806663513184, "kl": 0.008103388827294111, "learning_rate": 5.507324260195515e-07, "loss": 8.195021655410528e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 414.125, "completions/min_length": 360.0, "epoch": 10.419117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8702510595321655, "kl": 0.011395820882171392, "learning_rate": 5.506047536274528e-07, "loss": 0.00011344254016876221, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 419.3125, "completions/min_length": 377.0, "epoch": 10.420588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009418503381311893, "kl": 0.008346682996489108, "learning_rate": 5.504770779016412e-07, "loss": 8.30217613838613e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 425.875, "completions/min_length": 395.0, "epoch": 10.422058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009000816382467747, "kl": 0.00734050408937037, "learning_rate": 5.503493988505275e-07, "loss": 7.37830123398453e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 454.9375, "completions/min_length": 345.0, "epoch": 10.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9759506583213806, "kl": 0.0089049810776487, "learning_rate": 5.502217164825226e-07, "loss": 8.823963435133919e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 493.25, "completions/min_length": 456.0, "epoch": 10.425, "frac_reward_zero_std": 0.5, "grad_norm": 0.8527883291244507, "kl": 0.008993449446279556, "learning_rate": 5.500940308060381e-07, "loss": 8.997557597467676e-05, "reward": 0.684374988079071, "reward_std": 0.1977270543575287, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 7089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 487.75, "completions/min_length": 392.0, "epoch": 10.426470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.7392449378967285, "kl": 0.015578605700284243, "learning_rate": 5.499663418294857e-07, "loss": 0.00015462934970855713, "reward": 0.32083332538604736, "reward_std": 0.3807573914527893, "rewards/DrugCombAccuracyCOTORM/mean": 0.2291666716337204, "rewards/DrugCombAccuracyCOTORM/std": 0.4166666567325592, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 445.875, "completions/min_length": 377.0, "epoch": 10.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8575859069824219, "kl": 0.00859449349809438, "learning_rate": 5.498386495612771e-07, "loss": 8.56334954733029e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 460.8125, "completions/min_length": 387.0, "epoch": 10.429411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0074851512908936, "kl": 0.01486921519972384, "learning_rate": 5.497109540098246e-07, "loss": 0.00015230840654112399, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 443.3125, "completions/min_length": 418.0, "epoch": 10.430882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.014008582569658756, "kl": 0.00826035172212869, "learning_rate": 5.495832551835401e-07, "loss": 8.278593304567039e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 445.1875, "completions/min_length": 400.0, "epoch": 10.43235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009126028046011925, "kl": 0.006937185185961425, "learning_rate": 5.494555530908362e-07, "loss": 6.919664883753285e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 465.625, "completions/min_length": 395.0, "epoch": 10.433823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015071281231939793, "kl": 0.013442042516544461, "learning_rate": 5.493278477401256e-07, "loss": 0.0001331455132458359, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 445.25, "completions/min_length": 369.0, "epoch": 10.435294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9650440216064453, "kl": 0.011159285088069737, "learning_rate": 5.492001391398214e-07, "loss": 0.00011140358401462436, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 428.5625, "completions/min_length": 393.0, "epoch": 10.436764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9097187519073486, "kl": 0.008357161656022072, "learning_rate": 5.490724272983363e-07, "loss": 8.336454629898071e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 480.125, "completions/min_length": 434.0, "epoch": 10.438235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9366650581359863, "kl": 0.011711573577485979, "learning_rate": 5.489447122240841e-07, "loss": 0.00011727646051440388, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 7098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 463.875, "completions/min_length": 403.0, "epoch": 10.439705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9706173539161682, "kl": 0.010392613592557609, "learning_rate": 5.488169939254781e-07, "loss": 0.00010403245687484741, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 447.5, "completions/min_length": 392.0, "epoch": 10.441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8968279957771301, "kl": 0.010488627012819052, "learning_rate": 5.486892724109319e-07, "loss": 0.000104717597423587, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 438.125, "completions/min_length": 370.0, "epoch": 10.44264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0666950941085815, "kl": 0.010885002091526985, "learning_rate": 5.485615476888598e-07, "loss": 0.00010779061267385259, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 520.375, "completions/min_length": 417.0, "epoch": 10.444117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.7554054856300354, "kl": 0.009123921161517501, "learning_rate": 5.484338197676757e-07, "loss": 9.088590741157532e-05, "reward": 0.9664881229400635, "reward_std": 0.038090065121650696, "rewards/DrugCombAccuracyCOTORM/mean": 0.9607142806053162, "rewards/DrugCombAccuracyCOTORM/std": 0.0714285671710968, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 444.3125, "completions/min_length": 412.0, "epoch": 10.445588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014356005005538464, "kl": 0.009698937647044659, "learning_rate": 5.483060886557942e-07, "loss": 9.671837324276567e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 448.4375, "completions/min_length": 367.0, "epoch": 10.447058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714278936386108, "kl": 0.015947638545185328, "learning_rate": 5.4817835436163e-07, "loss": 0.00016027687524911016, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 448.5, "completions/min_length": 399.0, "epoch": 10.448529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9635791778564453, "kl": 0.0085532400989905, "learning_rate": 5.480506168935974e-07, "loss": 8.423253893852234e-05, "reward": 0.810699999332428, "reward_std": 0.20496807992458344, "rewards/DrugCombAccuracyCOTORM/mean": 0.7664999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.421848326921463, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 7105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 442.5, "completions/min_length": 371.0, "epoch": 10.45, "frac_reward_zero_std": 0.5, "grad_norm": 1.115410327911377, "kl": 0.008715640404261649, "learning_rate": 5.47922876260112e-07, "loss": 8.690356480656192e-05, "reward": 0.7320833206176758, "reward_std": 0.17561084032058716, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.41161268949508667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 7106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 492.25, "completions/min_length": 421.0, "epoch": 10.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8161710500717163, "kl": 0.014439358725212514, "learning_rate": 5.477951324695887e-07, "loss": 0.00014606863260269165, "reward": 0.703374981880188, "reward_std": 0.15036405622959137, "rewards/DrugCombAccuracyCOTORM/mean": 0.659166693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.42316532135009766, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3440970480442047, "step": 7107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 474.0, "completions/min_length": 436.0, "epoch": 10.452941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9820120334625244, "kl": 0.007552412454970181, "learning_rate": 5.47667385530443e-07, "loss": 7.574260234832764e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 475.0, "completions/min_length": 449.0, "epoch": 10.454411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.2132295370101929, "kl": 0.012405107729136944, "learning_rate": 5.475396354510906e-07, "loss": 0.00012484565377235413, "reward": 0.7312500476837158, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 526.5, "completions/min_length": 452.0, "epoch": 10.455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8742762207984924, "kl": 0.009072207030840218, "learning_rate": 5.474118822399475e-07, "loss": 9.145587682723999e-05, "reward": 0.909375011920929, "reward_std": 0.1831946223974228, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.2719528079032898, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 7110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 450.6875, "completions/min_length": 372.0, "epoch": 10.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0686886310577393, "kl": 0.009064912679605186, "learning_rate": 5.472841259054294e-07, "loss": 9.055653208633885e-05, "reward": 0.9552083015441895, "reward_std": 0.08368229866027832, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 423.0, "completions/min_length": 378.0, "epoch": 10.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.033502548933029175, "kl": 0.008913200348615646, "learning_rate": 5.471563664559528e-07, "loss": 8.933329081628472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 483.6875, "completions/min_length": 405.0, "epoch": 10.46029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0755667686462402, "kl": 0.012636416591703892, "learning_rate": 5.470286038999341e-07, "loss": 0.00012731888273265213, "reward": 0.6775000095367432, "reward_std": 0.24621419608592987, "rewards/DrugCombAccuracyCOTORM/mean": 0.675000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.47258156538009644, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 516.375, "completions/min_length": 392.0, "epoch": 10.461764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9050169587135315, "kl": 0.008756585069932044, "learning_rate": 5.469008382457899e-07, "loss": 8.746981620788574e-05, "reward": 0.8444681167602539, "reward_std": 0.16680924594402313, "rewards/DrugCombAccuracyCOTORM/mean": 0.8102726936340332, "rewards/DrugCombAccuracyCOTORM/std": 0.3469749987125397, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.10246951878070831, "step": 7114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 460.6875, "completions/min_length": 387.0, "epoch": 10.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010827699676156044, "kl": 0.008954633143730462, "learning_rate": 5.467730695019373e-07, "loss": 8.979396807262674e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 495.0625, "completions/min_length": 416.0, "epoch": 10.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9178877472877502, "kl": 0.010948491981253028, "learning_rate": 5.466452976767933e-07, "loss": 0.00010880082845687866, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 486.0, "completions/min_length": 437.0, "epoch": 10.466176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9935922622680664, "kl": 0.009970783954486251, "learning_rate": 5.465175227787749e-07, "loss": 9.934976696968079e-05, "reward": 0.7517499923706055, "reward_std": 0.21746620535850525, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.42015671730041504, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6718547940254211, "step": 7117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 431.5625, "completions/min_length": 349.0, "epoch": 10.467647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0441393852233887, "kl": 0.00840134872123599, "learning_rate": 5.463897448163e-07, "loss": 8.344483649125323e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 425.5, "completions/min_length": 385.0, "epoch": 10.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019738754257559776, "kl": 0.009808065951801836, "learning_rate": 5.46261963797786e-07, "loss": 9.774706268217415e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 473.375, "completions/min_length": 419.0, "epoch": 10.470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.6774318218231201, "kl": 0.01632595481351018, "learning_rate": 5.46134179731651e-07, "loss": 0.00016567111015319824, "reward": 0.675000011920929, "reward_std": 0.4256991147994995, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 455.375, "completions/min_length": 378.0, "epoch": 10.472058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1251153945922852, "kl": 0.009308048756793141, "learning_rate": 5.460063926263128e-07, "loss": 9.290128946304321e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 464.375, "completions/min_length": 328.0, "epoch": 10.473529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.2335596084594727, "kl": 0.011224412824958563, "learning_rate": 5.458786024901903e-07, "loss": 0.00011202692985534668, "reward": 0.3930000066757202, "reward_std": 0.26127690076828003, "rewards/DrugCombAccuracyCOTORM/mean": 0.3141666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.414034366607666, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3751543164253235, "step": 7122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 475.875, "completions/min_length": 416.0, "epoch": 10.475, "frac_reward_zero_std": 0.5, "grad_norm": 0.9449490904808044, "kl": 0.009372340980917215, "learning_rate": 5.457508093317013e-07, "loss": 9.398907423019409e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 462.1875, "completions/min_length": 421.0, "epoch": 10.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008820326067507267, "kl": 0.008262893534265459, "learning_rate": 5.456230131592647e-07, "loss": 8.235227141994983e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 490.0625, "completions/min_length": 454.0, "epoch": 10.477941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02097552828490734, "kl": 0.010143999941647053, "learning_rate": 5.454952139812995e-07, "loss": 0.00010042588110081851, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 441.0, "completions/min_length": 371.0, "epoch": 10.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0192515030503273, "kl": 0.009934426983818412, "learning_rate": 5.453674118062248e-07, "loss": 9.99170879367739e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 428.0, "completions/min_length": 346.0, "epoch": 10.480882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012549812905490398, "kl": 0.008494146051816642, "learning_rate": 5.452396066424598e-07, "loss": 8.452741894870996e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 7127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 457.0, "completions/min_length": 358.0, "epoch": 10.48235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8491746783256531, "kl": 0.011534061981365085, "learning_rate": 5.451117984984239e-07, "loss": 0.00011568883201107383, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 441.4375, "completions/min_length": 406.0, "epoch": 10.483823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9037559032440186, "kl": 0.01167754689231515, "learning_rate": 5.44983987382537e-07, "loss": 0.00011648982763290405, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 7129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 448.9375, "completions/min_length": 408.0, "epoch": 10.485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8957580327987671, "kl": 0.007683271309360862, "learning_rate": 5.448561733032187e-07, "loss": 7.696856482652947e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 449.875, "completions/min_length": 404.0, "epoch": 10.486764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9663048982620239, "kl": 0.006400730111636221, "learning_rate": 5.447283562688893e-07, "loss": 6.408244371414185e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 442.75, "completions/min_length": 379.0, "epoch": 10.488235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9806317687034607, "kl": 0.014957064297050238, "learning_rate": 5.44600536287969e-07, "loss": 0.0001483056548750028, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 489.625, "completions/min_length": 413.0, "epoch": 10.489705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.3395073413848877, "kl": 0.009448796743527055, "learning_rate": 5.444727133688783e-07, "loss": 9.471494558965787e-05, "reward": 0.8676170110702515, "reward_std": 0.013450654223561287, "rewards/DrugCombAccuracyCOTORM/mean": 0.8524900078773499, "rewards/DrugCombAccuracyCOTORM/std": 0.15454871952533722, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.856249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.15041610598564148, "step": 7133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 462.8125, "completions/min_length": 418.0, "epoch": 10.491176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.016365284100174904, "kl": 0.00853677827399224, "learning_rate": 5.443448875200378e-07, "loss": 8.541951683582738e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 434.0, "completions/min_length": 393.0, "epoch": 10.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013963392935693264, "kl": 0.009277477627620101, "learning_rate": 5.442170587498684e-07, "loss": 9.244441025657579e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 458.0625, "completions/min_length": 425.0, "epoch": 10.494117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9015442728996277, "kl": 0.010351666016504169, "learning_rate": 5.440892270667909e-07, "loss": 0.00010348619980504736, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 462.625, "completions/min_length": 392.0, "epoch": 10.495588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.3270466327667236, "kl": 0.009387801634147763, "learning_rate": 5.439613924792267e-07, "loss": 9.305030107498169e-05, "reward": 0.9312499761581421, "reward_std": 0.13611315190792084, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 439.9375, "completions/min_length": 376.0, "epoch": 10.49705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012843986973166466, "kl": 0.009614935261197388, "learning_rate": 5.438335549955973e-07, "loss": 9.62200720096007e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 506.875, "completions/min_length": 431.0, "epoch": 10.498529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9965011477470398, "kl": 0.008197229006327689, "learning_rate": 5.43705714624324e-07, "loss": 8.187612547772005e-05, "reward": 0.6994583606719971, "reward_std": 0.12231215834617615, "rewards/DrugCombAccuracyCOTORM/mean": 0.6438541412353516, "rewards/DrugCombAccuracyCOTORM/std": 0.4177210330963135, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.18726837635040283, "step": 7139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 452.25, "completions/min_length": 388.0, "epoch": 10.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.010993188247084618, "kl": 0.008763764053583145, "learning_rate": 5.435778713738292e-07, "loss": 8.760724449530244e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 434.875, "completions/min_length": 384.0, "epoch": 10.501470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011223316192626953, "kl": 0.009237125050276518, "learning_rate": 5.434500252525342e-07, "loss": 9.298494114773348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 469.1875, "completions/min_length": 375.0, "epoch": 10.50294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014617087319493294, "kl": 0.009210271062329412, "learning_rate": 5.433221762688616e-07, "loss": 9.249612776329741e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 486.1875, "completions/min_length": 408.0, "epoch": 10.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.03619785234332085, "kl": 0.013081239070743322, "learning_rate": 5.431943244312337e-07, "loss": 0.0001296143454965204, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 465.0, "completions/min_length": 402.0, "epoch": 10.505882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7729799747467041, "kl": 0.008835909771732986, "learning_rate": 5.43066469748073e-07, "loss": 8.819997310638428e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 436.5625, "completions/min_length": 389.0, "epoch": 10.507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02011544443666935, "kl": 0.00940458185505122, "learning_rate": 5.429386122278022e-07, "loss": 9.477388812229037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 454.375, "completions/min_length": 391.0, "epoch": 10.508823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.017679186537861824, "kl": 0.010255416389554739, "learning_rate": 5.428107518788445e-07, "loss": 0.0001033507360261865, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 507.3125, "completions/min_length": 418.0, "epoch": 10.510294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.5650825500488281, "kl": 0.012012511142529547, "learning_rate": 5.426828887096227e-07, "loss": 0.00012035667896270752, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 484.4375, "completions/min_length": 416.0, "epoch": 10.511764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8210381269454956, "kl": 0.009496876504272223, "learning_rate": 5.4255502272856e-07, "loss": 9.550154209136963e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 447.3125, "completions/min_length": 380.0, "epoch": 10.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0602697134017944, "kl": 0.00954716990236193, "learning_rate": 5.424271539440804e-07, "loss": 9.501098247710615e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 439.6875, "completions/min_length": 392.0, "epoch": 10.514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9307348728179932, "kl": 0.0076607175869867206, "learning_rate": 5.422992823646069e-07, "loss": 7.75456428527832e-05, "reward": 0.9619500041007996, "reward_std": 0.10762164741754532, "rewards/DrugCombAccuracyCOTORM/mean": 0.9539999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.18400000035762787, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 7150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 440.0, "completions/min_length": 394.0, "epoch": 10.516176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012246206402778625, "kl": 0.007411854458041489, "learning_rate": 5.421714079985641e-07, "loss": 7.388678204733878e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 427.375, "completions/min_length": 366.0, "epoch": 10.51764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03299243003129959, "kl": 0.009355467045679688, "learning_rate": 5.420435308543756e-07, "loss": 9.318340744357556e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 425.25, "completions/min_length": 345.0, "epoch": 10.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.035412222146987915, "kl": 0.011510971700772643, "learning_rate": 5.419156509404655e-07, "loss": 0.00011544901644811034, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 427.4375, "completions/min_length": 356.0, "epoch": 10.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0063592195510864, "kl": 0.012753966264426708, "learning_rate": 5.417877682652585e-07, "loss": 0.00012905150651931763, "reward": 0.6546041965484619, "reward_std": 0.047317225486040115, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 7154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 450.625, "completions/min_length": 393.0, "epoch": 10.522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0067169666290283, "kl": 0.014086977113038301, "learning_rate": 5.41659882837179e-07, "loss": 0.0001407568051945418, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 457.125, "completions/min_length": 382.0, "epoch": 10.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9480498433113098, "kl": 0.012269368162378669, "learning_rate": 5.41531994664652e-07, "loss": 0.00012260826770216227, "reward": 0.5625, "reward_std": 0.051754921674728394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 502.375, "completions/min_length": 406.0, "epoch": 10.525, "frac_reward_zero_std": 0.0, "grad_norm": 1.8736575841903687, "kl": 0.01583484443835914, "learning_rate": 5.414041037561021e-07, "loss": 0.00016066431999206543, "reward": 0.3959391415119171, "reward_std": 0.2754814624786377, "rewards/DrugCombAccuracyCOTORM/mean": 0.3074239194393158, "rewards/DrugCombAccuracyCOTORM/std": 0.4189569652080536, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.6366579532623291, "step": 7157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 459.0625, "completions/min_length": 383.0, "epoch": 10.526470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8573724031448364, "kl": 0.007607465027831495, "learning_rate": 5.412762101199548e-07, "loss": 7.62721974751912e-05, "reward": 0.5874999761581421, "reward_std": 0.02314550243318081, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 467.3125, "completions/min_length": 433.0, "epoch": 10.527941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0968103408813477, "kl": 0.008901575114578009, "learning_rate": 5.411483137646352e-07, "loss": 8.901208639144897e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 465.125, "completions/min_length": 368.0, "epoch": 10.529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9359354376792908, "kl": 0.014991468982771039, "learning_rate": 5.410204146985689e-07, "loss": 0.00015111645916476846, "reward": 0.8552500009536743, "reward_std": 0.14858387410640717, "rewards/DrugCombAccuracyCOTORM/mean": 0.8424999713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.2566666603088379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.4901813864707947, "step": 7160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 426.75, "completions/min_length": 361.0, "epoch": 10.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.2180711030960083, "kl": 0.010607129312120378, "learning_rate": 5.408925129301812e-07, "loss": 0.00010656748054316267, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 487.1875, "completions/min_length": 391.0, "epoch": 10.532352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.3345791101455688, "kl": 0.01022043521516025, "learning_rate": 5.407646084678985e-07, "loss": 0.00010322779417037964, "reward": 0.550000011920929, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 464.6875, "completions/min_length": 391.0, "epoch": 10.533823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.007208228111267, "kl": 0.008112768875434995, "learning_rate": 5.406367013201465e-07, "loss": 8.104133303277194e-05, "reward": 0.7648749947547913, "reward_std": 0.14512228965759277, "rewards/DrugCombAccuracyCOTORM/mean": 0.7178124785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.3762499988079071, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 459.8125, "completions/min_length": 406.0, "epoch": 10.535294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0415599346160889, "kl": 0.01246567303314805, "learning_rate": 5.405087914953514e-07, "loss": 0.00012627243995666504, "reward": 0.7875000238418579, "reward_std": 0.22951814532279968, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 530.125, "completions/min_length": 440.0, "epoch": 10.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7827365398406982, "kl": 0.009264279156923294, "learning_rate": 5.403808790019397e-07, "loss": 9.213248267769814e-05, "reward": 0.8179374933242798, "reward_std": 0.202492356300354, "rewards/DrugCombAccuracyCOTORM/mean": 0.7860937118530273, "rewards/DrugCombAccuracyCOTORM/std": 0.3898426294326782, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.890625, "rewards/DrugCombCoverageCOTORM/std": 0.269548237323761, "step": 7165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 532.5, "completions/min_length": 442.0, "epoch": 10.538235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.8035150766372681, "kl": 0.008407676010392606, "learning_rate": 5.402529638483379e-07, "loss": 8.439458906650543e-05, "reward": 0.7442708015441895, "reward_std": 0.10976476222276688, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.356000155210495, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 416.9375, "completions/min_length": 355.0, "epoch": 10.53970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015577953308820724, "kl": 0.008092145319096744, "learning_rate": 5.401250460429726e-07, "loss": 8.122032159008086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 468.125, "completions/min_length": 373.0, "epoch": 10.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1455904245376587, "kl": 0.010296652093529701, "learning_rate": 5.399971255942708e-07, "loss": 0.00010137227218365297, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 447.8125, "completions/min_length": 384.0, "epoch": 10.54264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013793934136629105, "kl": 0.008474115747958422, "learning_rate": 5.398692025106597e-07, "loss": 8.441660611424595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 441.25, "completions/min_length": 380.0, "epoch": 10.544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.015264777466654778, "kl": 0.009000647463835776, "learning_rate": 5.397412768005664e-07, "loss": 9.000153659144416e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 472.0, "completions/min_length": 411.0, "epoch": 10.545588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8288937211036682, "kl": 0.009818260674364865, "learning_rate": 5.396133484724184e-07, "loss": 9.855037205852568e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 510.0, "completions/min_length": 389.0, "epoch": 10.547058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7952777147293091, "kl": 0.010836256202310324, "learning_rate": 5.394854175346432e-07, "loss": 0.000108345877379179, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 447.0, "completions/min_length": 392.0, "epoch": 10.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.021556558087468147, "kl": 0.009050338994711637, "learning_rate": 5.393574839956685e-07, "loss": 8.972477371571586e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 7173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 453.5, "completions/min_length": 409.0, "epoch": 10.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.013037072494626045, "kl": 0.007393174688331783, "learning_rate": 5.392295478639225e-07, "loss": 7.369773084064946e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 492.3125, "completions/min_length": 411.0, "epoch": 10.551470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.443291187286377, "kl": 0.012126571382395923, "learning_rate": 5.39101609147833e-07, "loss": 0.0001219138503074646, "reward": 0.8677083253860474, "reward_std": 0.2659962773323059, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 7175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 463.75, "completions/min_length": 398.0, "epoch": 10.552941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.024870911613106728, "kl": 0.0090484784450382, "learning_rate": 5.389736678558288e-07, "loss": 9.013481758302078e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 444.8125, "completions/min_length": 378.0, "epoch": 10.554411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01010215189307928, "kl": 0.007518609752878547, "learning_rate": 5.388457239963377e-07, "loss": 7.524043030571193e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 473.75, "completions/min_length": 400.0, "epoch": 10.555882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8693445324897766, "kl": 0.01209672843106091, "learning_rate": 5.387177775777887e-07, "loss": 0.00012050336954416707, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 486.375, "completions/min_length": 425.0, "epoch": 10.55735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8721784949302673, "kl": 0.008878053282387555, "learning_rate": 5.385898286086105e-07, "loss": 8.887797594070435e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 525.125, "completions/min_length": 444.0, "epoch": 10.558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.05166113004088402, "kl": 0.00997479259967804, "learning_rate": 5.384618770972319e-07, "loss": 9.983091149479151e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 491.8125, "completions/min_length": 434.0, "epoch": 10.560294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.5225671529769897, "kl": 0.012060063891112804, "learning_rate": 5.383339230520823e-07, "loss": 0.00012033432722091675, "reward": 0.5375000238418579, "reward_std": 0.43379276990890503, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 414.4375, "completions/min_length": 345.0, "epoch": 10.561764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.04564270004630089, "kl": 0.011972304549999535, "learning_rate": 5.38205966481591e-07, "loss": 0.00012073465768480673, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 453.5, "completions/min_length": 383.0, "epoch": 10.563235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.013005580753087997, "kl": 0.008364122943021357, "learning_rate": 5.38078007394187e-07, "loss": 8.28569900477305e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 455.0625, "completions/min_length": 385.0, "epoch": 10.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8255075812339783, "kl": 0.012617772677913308, "learning_rate": 5.379500457983005e-07, "loss": 0.0001264921884285286, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 458.125, "completions/min_length": 364.0, "epoch": 10.566176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.083080768585205, "kl": 0.010498984484001994, "learning_rate": 5.378220817023609e-07, "loss": 0.00010499243217054754, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 515.3125, "completions/min_length": 448.0, "epoch": 10.56764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7804883122444153, "kl": 0.009145409101620317, "learning_rate": 5.376941151147983e-07, "loss": 9.116803266806528e-05, "reward": 0.671875, "reward_std": 0.008685990236699581, "rewards/DrugCombAccuracyCOTORM/mean": 0.6104910373687744, "rewards/DrugCombAccuracyCOTORM/std": 0.40242692828178406, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8348214626312256, "rewards/DrugCombCoverageCOTORM/std": 0.17368309199810028, "step": 7186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 412.9375, "completions/min_length": 357.0, "epoch": 10.569117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9966039061546326, "kl": 0.010903435060754418, "learning_rate": 5.375661460440427e-07, "loss": 0.00010874772851821035, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 444.6875, "completions/min_length": 401.0, "epoch": 10.570588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010697856545448303, "kl": 0.007823700550943613, "learning_rate": 5.374381744985246e-07, "loss": 7.791502866894007e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 429.9375, "completions/min_length": 384.0, "epoch": 10.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.02131390944123268, "kl": 0.010796370333991945, "learning_rate": 5.373102004866743e-07, "loss": 0.00010829584061866626, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 503.0, "completions/min_length": 403.0, "epoch": 10.573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9813193678855896, "kl": 0.01319790375418961, "learning_rate": 5.371822240169224e-07, "loss": 0.00013350231165532023, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 440.1875, "completions/min_length": 385.0, "epoch": 10.575, "frac_reward_zero_std": 1.0, "grad_norm": 1.6336339712142944, "kl": 0.045803123619407415, "learning_rate": 5.370542450976997e-07, "loss": 0.00044973165495321155, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 499.1875, "completions/min_length": 468.0, "epoch": 10.576470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014729747548699379, "kl": 0.007985788863152266, "learning_rate": 5.36926263737437e-07, "loss": 7.995526539161801e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 558.375, "completions/min_length": 460.0, "epoch": 10.577941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.379925012588501, "kl": 0.01308233360759914, "learning_rate": 5.367982799445655e-07, "loss": 0.00012973323464393616, "reward": 0.3257708251476288, "reward_std": 0.2914591133594513, "rewards/DrugCombAccuracyCOTORM/mean": 0.21841144561767578, "rewards/DrugCombAccuracyCOTORM/std": 0.3330132067203522, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5104166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.23149076104164124, "step": 7193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 456.1875, "completions/min_length": 405.0, "epoch": 10.579411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.9934755563735962, "kl": 0.011523180641233921, "learning_rate": 5.366702937275165e-07, "loss": 0.00011594220995903015, "reward": 0.75, "reward_std": 0.35523033142089844, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 475.5, "completions/min_length": 391.0, "epoch": 10.580882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9674705266952515, "kl": 0.008589874487370253, "learning_rate": 5.365423050947213e-07, "loss": 8.563697338104248e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 420.75, "completions/min_length": 337.0, "epoch": 10.58235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9764953255653381, "kl": 0.01089476840570569, "learning_rate": 5.364143140546116e-07, "loss": 0.00010784715414047241, "reward": 0.7677083611488342, "reward_std": 0.10914872586727142, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 7196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 443.125, "completions/min_length": 359.0, "epoch": 10.583823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.5890504121780396, "kl": 0.0124530338216573, "learning_rate": 5.36286320615619e-07, "loss": 0.00012297183275222778, "reward": 0.5313750505447388, "reward_std": 0.45389124751091003, "rewards/DrugCombAccuracyCOTORM/mean": 0.4728125035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.490604966878891, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.7869744300842285, "step": 7197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 481.8125, "completions/min_length": 414.0, "epoch": 10.58529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012913620099425316, "kl": 0.008244312601163983, "learning_rate": 5.361583247861755e-07, "loss": 8.22691508801654e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 446.3125, "completions/min_length": 398.0, "epoch": 10.586764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.03950474411249161, "kl": 0.009024031460285187, "learning_rate": 5.360303265747131e-07, "loss": 9.066049824468791e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 431.75, "completions/min_length": 395.0, "epoch": 10.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010389055125415325, "kl": 0.008072319673374295, "learning_rate": 5.359023259896638e-07, "loss": 8.092832285910845e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 444.125, "completions/min_length": 364.0, "epoch": 10.589705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011063436977565289, "kl": 0.008622245746664703, "learning_rate": 5.357743230394601e-07, "loss": 8.662457548780367e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 461.6875, "completions/min_length": 430.0, "epoch": 10.591176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.03325319662690163, "kl": 0.010423258878290653, "learning_rate": 5.356463177325346e-07, "loss": 0.00010362880129832774, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 430.5625, "completions/min_length": 368.0, "epoch": 10.592647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02134053222835064, "kl": 0.009178654756397009, "learning_rate": 5.3551831007732e-07, "loss": 9.151041740551591e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 438.75, "completions/min_length": 400.0, "epoch": 10.594117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0878793001174927, "kl": 0.010690958704799414, "learning_rate": 5.35390300082249e-07, "loss": 0.00010699033737182617, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 457.375, "completions/min_length": 401.0, "epoch": 10.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.020570965483784676, "kl": 0.010025640949606895, "learning_rate": 5.352622877557544e-07, "loss": 0.00010075929458253086, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 440.8125, "completions/min_length": 378.0, "epoch": 10.597058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.1631395816802979, "kl": 0.009440957917831838, "learning_rate": 5.351342731062697e-07, "loss": 9.359046816825867e-05, "reward": 0.84375, "reward_std": 0.22469735145568848, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 444.4375, "completions/min_length": 395.0, "epoch": 10.598529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010589784011244774, "kl": 0.008334695012308657, "learning_rate": 5.350062561422279e-07, "loss": 8.243011689046398e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 426.0, "completions/min_length": 381.0, "epoch": 10.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.017955368384718895, "kl": 0.008154189446941018, "learning_rate": 5.348782368720625e-07, "loss": 8.110089402180165e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 474.0, "completions/min_length": 361.0, "epoch": 10.601470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3347643613815308, "kl": 0.011834616074338555, "learning_rate": 5.347502153042073e-07, "loss": 0.0001179426908493042, "reward": 0.6240832805633545, "reward_std": 0.38800984621047974, "rewards/DrugCombAccuracyCOTORM/mean": 0.5587500333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4726926386356354, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 7209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 464.5, "completions/min_length": 436.0, "epoch": 10.602941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.2938226461410522, "kl": 0.01092402613721788, "learning_rate": 5.346221914470958e-07, "loss": 0.00010921061038970947, "reward": 0.5095000267028809, "reward_std": 0.22401538491249084, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833700954914093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 7210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 434.9375, "completions/min_length": 393.0, "epoch": 10.604411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010370356030762196, "kl": 0.009534001699648798, "learning_rate": 5.344941653091619e-07, "loss": 9.555587894283235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 469.25, "completions/min_length": 375.0, "epoch": 10.605882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8664947152137756, "kl": 0.010942323599010706, "learning_rate": 5.343661368988397e-07, "loss": 0.00010971841402351856, "reward": 0.9619500041007996, "reward_std": 0.10762164741754532, "rewards/DrugCombAccuracyCOTORM/mean": 0.9539999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.18400000035762787, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 7212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 415.0625, "completions/min_length": 372.0, "epoch": 10.60735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014064745977520943, "kl": 0.009041503188200295, "learning_rate": 5.342381062245633e-07, "loss": 9.138494351645932e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 467.625, "completions/min_length": 409.0, "epoch": 10.608823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.009845172986388206, "kl": 0.00703504157718271, "learning_rate": 5.341100732947672e-07, "loss": 7.085663673933595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 496.8125, "completions/min_length": 440.0, "epoch": 10.610294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.22072172164917, "kl": 0.00925477349665016, "learning_rate": 5.339820381178858e-07, "loss": 9.223818778991699e-05, "reward": 0.2096666693687439, "reward_std": 0.1723005771636963, "rewards/DrugCombAccuracyCOTORM/mean": 0.12666666507720947, "rewards/DrugCombAccuracyCOTORM/std": 0.2527713179588318, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0833333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.6382848024368286, "step": 7215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 451.375, "completions/min_length": 376.0, "epoch": 10.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010098314844071865, "kl": 0.007963667740114033, "learning_rate": 5.338540007023538e-07, "loss": 7.947454287204891e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 487.6875, "completions/min_length": 445.0, "epoch": 10.613235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8275392651557922, "kl": 0.011522743152454495, "learning_rate": 5.337259610566057e-07, "loss": 0.00011501817789394408, "reward": 0.659250020980835, "reward_std": 0.14119592308998108, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 7217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 447.25, "completions/min_length": 372.0, "epoch": 10.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.015629786998033524, "kl": 0.009521158412098885, "learning_rate": 5.335979191890767e-07, "loss": 9.422787115909159e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 459.125, "completions/min_length": 419.0, "epoch": 10.616176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8936067223548889, "kl": 0.00805512280203402, "learning_rate": 5.334698751082018e-07, "loss": 8.048117160797119e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 475.625, "completions/min_length": 427.0, "epoch": 10.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02618197165429592, "kl": 0.008753145695663989, "learning_rate": 5.333418288224163e-07, "loss": 8.736344170756638e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 460.25, "completions/min_length": 374.0, "epoch": 10.619117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.016164448112249374, "kl": 0.010099452221766114, "learning_rate": 5.332137803401556e-07, "loss": 9.970381506718695e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 419.1875, "completions/min_length": 367.0, "epoch": 10.620588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010849092155694962, "kl": 0.0076387308072298765, "learning_rate": 5.33085729669855e-07, "loss": 7.623680721735582e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 457.0, "completions/min_length": 423.0, "epoch": 10.62205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.011654473841190338, "kl": 0.009284393396228552, "learning_rate": 5.329576768199501e-07, "loss": 9.259667422156781e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 431.625, "completions/min_length": 374.0, "epoch": 10.623529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.015541888773441315, "kl": 0.008272831211797893, "learning_rate": 5.32829621798877e-07, "loss": 8.389264257857576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 472.0625, "completions/min_length": 416.0, "epoch": 10.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.010809305123984814, "kl": 0.009447426768019795, "learning_rate": 5.327015646150716e-07, "loss": 9.366086305817589e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 469.4375, "completions/min_length": 401.0, "epoch": 10.626470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.3161752223968506, "kl": 0.009789193980395794, "learning_rate": 5.325735052769696e-07, "loss": 9.85450460575521e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 445.8125, "completions/min_length": 378.0, "epoch": 10.62794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008703859522938728, "kl": 0.0067133811535313725, "learning_rate": 5.324454437930077e-07, "loss": 6.740159733453766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 481.8125, "completions/min_length": 461.0, "epoch": 10.629411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7004601955413818, "kl": 0.008803998702205718, "learning_rate": 5.323173801716221e-07, "loss": 8.80211591720581e-05, "reward": 0.25349998474121094, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.20750001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.326751708984375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.9098230004310608, "step": 7228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 463.4375, "completions/min_length": 400.0, "epoch": 10.630882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5162556171417236, "kl": 0.014040666166692972, "learning_rate": 5.321893144212492e-07, "loss": 0.00014136172831058502, "reward": 0.7640625238418579, "reward_std": 0.35938945412635803, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.5691733956336975, "step": 7229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 450.125, "completions/min_length": 391.0, "epoch": 10.632352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.28913152217865, "kl": 0.013094390276819468, "learning_rate": 5.320612465503258e-07, "loss": 0.00012887641787528992, "reward": 0.6864583492279053, "reward_std": 0.45884037017822266, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4714045524597168, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.7180703282356262, "step": 7230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 430.0625, "completions/min_length": 389.0, "epoch": 10.633823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010509892366826534, "kl": 0.008587759803049266, "learning_rate": 5.319331765672886e-07, "loss": 8.6276835645549e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 401.4375, "completions/min_length": 331.0, "epoch": 10.635294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515194654464722, "kl": 0.011780941393226385, "learning_rate": 5.318051044805744e-07, "loss": 0.00011811405420303345, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 450.25, "completions/min_length": 399.0, "epoch": 10.636764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020913314074277878, "kl": 0.006737954448908567, "learning_rate": 5.316770302986205e-07, "loss": 6.723601836711168e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 471.125, "completions/min_length": 405.0, "epoch": 10.638235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4633232355117798, "kl": 0.01832386595197022, "learning_rate": 5.315489540298641e-07, "loss": 0.00018489733338356018, "reward": 0.7749999761581421, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 426.1875, "completions/min_length": 372.0, "epoch": 10.639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.017032025381922722, "kl": 0.007914130226708949, "learning_rate": 5.314208756827424e-07, "loss": 8.013113983906806e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 488.875, "completions/min_length": 406.0, "epoch": 10.641176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.1682470291852951, "kl": 0.011432851199060678, "learning_rate": 5.312927952656929e-07, "loss": 0.0001136543505708687, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 466.5, "completions/min_length": 415.0, "epoch": 10.64264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8718402981758118, "kl": 0.010883507318794727, "learning_rate": 5.311647127871531e-07, "loss": 0.00010912120342254639, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 503.875, "completions/min_length": 348.0, "epoch": 10.644117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7817734479904175, "kl": 0.010238279472105205, "learning_rate": 5.31036628255561e-07, "loss": 0.00010282546281814575, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 411.6875, "completions/min_length": 358.0, "epoch": 10.645588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9498307704925537, "kl": 0.009143823059275746, "learning_rate": 5.309085416793546e-07, "loss": 9.070336818695068e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 445.1875, "completions/min_length": 366.0, "epoch": 10.647058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 1.2582224607467651, "kl": 0.012058873660862446, "learning_rate": 5.307804530669715e-07, "loss": 0.00011993199586868286, "reward": 0.7312500476837158, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 491.0625, "completions/min_length": 407.0, "epoch": 10.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.17848904430866241, "kl": 0.016870031831786036, "learning_rate": 5.306523624268499e-07, "loss": 0.000168491606018506, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 453.125, "completions/min_length": 406.0, "epoch": 10.65, "frac_reward_zero_std": 0.0, "grad_norm": 1.3189678192138672, "kl": 0.01422748970799148, "learning_rate": 5.305242697674285e-07, "loss": 0.00014351308345794678, "reward": 0.75, "reward_std": 0.35523033142089844, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 445.375, "completions/min_length": 413.0, "epoch": 10.651470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017774909734725952, "kl": 0.011527352267876267, "learning_rate": 5.303961750971453e-07, "loss": 0.00011587213521124795, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 451.875, "completions/min_length": 411.0, "epoch": 10.652941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9078145623207092, "kl": 0.014062898699194193, "learning_rate": 5.30268078424439e-07, "loss": 0.0001406744122505188, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 424.875, "completions/min_length": 330.0, "epoch": 10.654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7413195967674255, "kl": 0.008838958339765668, "learning_rate": 5.301399797577482e-07, "loss": 8.916854858398438e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/mean_length": 495.5625, "completions/min_length": 367.0, "epoch": 10.655882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.087607979774475, "kl": 0.01462357142008841, "learning_rate": 5.300118791055121e-07, "loss": 0.00014720181934535503, "reward": 0.9035625457763672, "reward_std": 0.17979580163955688, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.1359764039516449, "step": 7246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 460.125, "completions/min_length": 382.0, "epoch": 10.657352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8313068151473999, "kl": 0.007763205794617534, "learning_rate": 5.298837764761692e-07, "loss": 7.76994347688742e-05, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 450.9375, "completions/min_length": 382.0, "epoch": 10.658823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9860591292381287, "kl": 0.013487125630490482, "learning_rate": 5.297556718781587e-07, "loss": 0.0001338580623269081, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 493.125, "completions/min_length": 446.0, "epoch": 10.660294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8464266657829285, "kl": 0.008550138329155743, "learning_rate": 5.296275653199198e-07, "loss": 8.5268504335545e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 473.6875, "completions/min_length": 420.0, "epoch": 10.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020368708297610283, "kl": 0.009298916789703071, "learning_rate": 5.294994568098919e-07, "loss": 9.433977538719773e-05, "reward": 0.7666666507720947, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 7250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 475.8125, "completions/min_length": 441.0, "epoch": 10.663235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.007586643565446138, "kl": 0.007598665542900562, "learning_rate": 5.293713463565145e-07, "loss": 7.568910223199055e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 475.875, "completions/min_length": 398.0, "epoch": 10.66470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.095441222190857, "kl": 0.011228034272789955, "learning_rate": 5.292432339682272e-07, "loss": 0.00011136382818222046, "reward": 0.567187488079071, "reward_std": 0.17537428438663483, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.981070876121521, "step": 7252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 466.4375, "completions/min_length": 394.0, "epoch": 10.666176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9197196364402771, "kl": 0.009612591471523046, "learning_rate": 5.291151196534696e-07, "loss": 9.713191684568301e-05, "reward": 0.5845000147819519, "reward_std": 0.0647699236869812, "rewards/DrugCombAccuracyCOTORM/mean": 0.5274999737739563, "rewards/DrugCombAccuracyCOTORM/std": 0.49293002486228943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 7253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 550.8125, "completions/min_length": 444.0, "epoch": 10.66764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 9.158276557922363, "kl": 0.19406896375585347, "learning_rate": 5.289870034206815e-07, "loss": 0.001858823001384735, "reward": 0.7444138526916504, "reward_std": 0.31942492723464966, "rewards/DrugCombAccuracyCOTORM/mean": 0.698312520980835, "rewards/DrugCombAccuracyCOTORM/std": 0.40327906608581543, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8576388955116272, "rewards/DrugCombCoverageCOTORM/std": 0.49771857261657715, "step": 7254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 541.125, "completions/min_length": 451.0, "epoch": 10.669117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.521113395690918, "kl": 0.014505954226478934, "learning_rate": 5.28858885278303e-07, "loss": 0.00014274567365646362, "reward": 0.7988749742507935, "reward_std": 0.3594454526901245, "rewards/DrugCombAccuracyCOTORM/mean": 0.7603124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43035051226615906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 7255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 390.5, "completions/min_length": 353.0, "epoch": 10.670588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009405874647200108, "kl": 0.0066880646627396345, "learning_rate": 5.287307652347743e-07, "loss": 6.740554817952216e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 440.8125, "completions/min_length": 403.0, "epoch": 10.672058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.05665741488337517, "kl": 0.009876691503450274, "learning_rate": 5.286026432985353e-07, "loss": 9.897237760014832e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 431.8125, "completions/min_length": 387.0, "epoch": 10.673529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.016104379668831825, "kl": 0.007527919369749725, "learning_rate": 5.284745194780266e-07, "loss": 7.557742355857044e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 468.875, "completions/min_length": 425.0, "epoch": 10.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.010483507066965103, "kl": 0.007373314932920039, "learning_rate": 5.283463937816887e-07, "loss": 7.321575685637072e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 445.4375, "completions/min_length": 395.0, "epoch": 10.676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.014476858079433441, "kl": 0.01348510617390275, "learning_rate": 5.282182662179622e-07, "loss": 0.00013441164628602564, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 439.75, "completions/min_length": 372.0, "epoch": 10.677941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9052090048789978, "kl": 0.010544862481765449, "learning_rate": 5.280901367952875e-07, "loss": 0.00010513044253457338, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 7261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 425.0625, "completions/min_length": 363.0, "epoch": 10.679411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01246081292629242, "kl": 0.008812068495899439, "learning_rate": 5.279620055221058e-07, "loss": 8.844361582305282e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 449.125, "completions/min_length": 408.0, "epoch": 10.680882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0716890096664429, "kl": 0.01206968433689326, "learning_rate": 5.278338724068579e-07, "loss": 0.00012038648128509521, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 472.375, "completions/min_length": 430.0, "epoch": 10.68235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02299354411661625, "kl": 0.009809381910599768, "learning_rate": 5.27705737457985e-07, "loss": 9.692645107861608e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 460.1875, "completions/min_length": 382.0, "epoch": 10.683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.049545273184776306, "kl": 0.011612076545134187, "learning_rate": 5.275776006839282e-07, "loss": 0.00011515011283336207, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 491.8125, "completions/min_length": 440.0, "epoch": 10.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.022667165845632553, "kl": 0.010360321612097323, "learning_rate": 5.274494620931288e-07, "loss": 0.00010324835602659732, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 430.625, "completions/min_length": 382.0, "epoch": 10.686764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.015100309625267982, "kl": 0.008927257382310927, "learning_rate": 5.273213216940284e-07, "loss": 8.913830242818221e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 460.0625, "completions/min_length": 368.0, "epoch": 10.688235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9611595869064331, "kl": 0.011839275946840644, "learning_rate": 5.271931794950682e-07, "loss": 0.00011855697812279686, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 458.0625, "completions/min_length": 404.0, "epoch": 10.689705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02456836588680744, "kl": 0.008234620443545282, "learning_rate": 5.270650355046904e-07, "loss": 8.196175622288138e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 445.0625, "completions/min_length": 364.0, "epoch": 10.691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8946335911750793, "kl": 0.011175058083608747, "learning_rate": 5.269368897313367e-07, "loss": 0.0001104447219404392, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 402.5, "completions/min_length": 369.0, "epoch": 10.69264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011949368752539158, "kl": 0.009273125557228923, "learning_rate": 5.268087421834486e-07, "loss": 9.289140143664554e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 402.1875, "completions/min_length": 344.0, "epoch": 10.694117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0196020603179932, "kl": 0.011874240590259433, "learning_rate": 5.266805928694684e-07, "loss": 0.00011771917343139648, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 482.3125, "completions/min_length": 387.0, "epoch": 10.695588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7555522322654724, "kl": 0.009909011423587799, "learning_rate": 5.265524417978385e-07, "loss": 9.898282587528229e-05, "reward": 0.7900000214576721, "reward_std": 0.1506810039281845, "rewards/DrugCombAccuracyCOTORM/mean": 0.7583333253860474, "rewards/DrugCombAccuracyCOTORM/std": 0.33277732133865356, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 432.4375, "completions/min_length": 379.0, "epoch": 10.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8998157978057861, "kl": 0.011125432327389717, "learning_rate": 5.264242889770007e-07, "loss": 0.0001115715058404021, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 480.75, "completions/min_length": 362.0, "epoch": 10.698529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8655258417129517, "kl": 0.008653558790683746, "learning_rate": 5.262961344153977e-07, "loss": 8.7294916738756e-05, "reward": 0.7519904375076294, "reward_std": 0.19824865460395813, "rewards/DrugCombAccuracyCOTORM/mean": 0.7170714139938354, "rewards/DrugCombAccuracyCOTORM/std": 0.419081449508667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7833333015441895, "rewards/DrugCombCoverageCOTORM/std": 0.5027332901954651, "step": 7275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 422.5, "completions/min_length": 387.0, "epoch": 10.7, "frac_reward_zero_std": 0.5, "grad_norm": 1.1726205348968506, "kl": 0.010988596710376441, "learning_rate": 5.26167978121472e-07, "loss": 0.00011102855205535889, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 425.75, "completions/min_length": 384.0, "epoch": 10.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009249217808246613, "kl": 0.006808142876252532, "learning_rate": 5.26039820103666e-07, "loss": 6.806867895647883e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 476.75, "completions/min_length": 375.0, "epoch": 10.702941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9519541263580322, "kl": 0.010002768831327558, "learning_rate": 5.259116603704226e-07, "loss": 0.00010026805102825165, "reward": 0.9010952711105347, "reward_std": 0.18542854487895966, "rewards/DrugCombAccuracyCOTORM/mean": 0.8867857456207275, "rewards/DrugCombAccuracyCOTORM/std": 0.31127017736434937, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 7278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 466.875, "completions/min_length": 396.0, "epoch": 10.704411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8292537927627563, "kl": 0.00909606390632689, "learning_rate": 5.257834989301844e-07, "loss": 9.027485793922096e-05, "reward": 0.925000011920929, "reward_std": 0.13887301087379456, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 501.0625, "completions/min_length": 454.0, "epoch": 10.705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9610195755958557, "kl": 0.009467862779274583, "learning_rate": 5.256553357913947e-07, "loss": 9.420602873433381e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 482.375, "completions/min_length": 441.0, "epoch": 10.70735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0325897932052612, "kl": 0.010228566941805184, "learning_rate": 5.255271709624965e-07, "loss": 0.00010161846876144409, "reward": 0.8999999761581421, "reward_std": 0.10690448433160782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 445.5625, "completions/min_length": 327.0, "epoch": 10.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8632562756538391, "kl": 0.008494076668284833, "learning_rate": 5.253990044519328e-07, "loss": 8.502297714585438e-05, "reward": 0.9879167079925537, "reward_std": 0.03417681157588959, "rewards/DrugCombAccuracyCOTORM/mean": 0.987500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.05000000074505806, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 448.4375, "completions/min_length": 377.0, "epoch": 10.71029411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4970223903656006, "kl": 0.012582467868924141, "learning_rate": 5.252708362681471e-07, "loss": 0.00012584030628204346, "reward": 0.93022620677948, "reward_std": 0.19735011458396912, "rewards/DrugCombAccuracyCOTORM/mean": 0.9205952286720276, "rewards/DrugCombAccuracyCOTORM/std": 0.25459450483322144, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 421.25, "completions/min_length": 369.0, "epoch": 10.711764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.00834880955517292, "kl": 0.007688770652748644, "learning_rate": 5.251426664195824e-07, "loss": 7.683251169510186e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 403.0625, "completions/min_length": 327.0, "epoch": 10.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014661266468465328, "kl": 0.007550276699475944, "learning_rate": 5.250144949146826e-07, "loss": 7.507945701945573e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 451.0, "completions/min_length": 381.0, "epoch": 10.714705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01045360416173935, "kl": 0.008788528153672814, "learning_rate": 5.248863217618912e-07, "loss": 8.668203372508287e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 456.9375, "completions/min_length": 420.0, "epoch": 10.716176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1238818168640137, "kl": 0.011497583240270615, "learning_rate": 5.247581469696518e-07, "loss": 0.00011422485113143921, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 463.1875, "completions/min_length": 408.0, "epoch": 10.717647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1478084325790405, "kl": 0.011952504748478532, "learning_rate": 5.246299705464085e-07, "loss": 0.0001190925613627769, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 462.625, "completions/min_length": 420.0, "epoch": 10.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.2675365209579468, "kl": 0.010067354771308601, "learning_rate": 5.245017925006049e-07, "loss": 9.88023602985777e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 434.75, "completions/min_length": 392.0, "epoch": 10.720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9673241972923279, "kl": 0.009876174270175397, "learning_rate": 5.243736128406853e-07, "loss": 9.845175372902304e-05, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 504.6875, "completions/min_length": 418.0, "epoch": 10.722058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8116341829299927, "kl": 0.008736365009099245, "learning_rate": 5.242454315750937e-07, "loss": 8.743575745029375e-05, "reward": 0.9104166626930237, "reward_std": 0.09705483913421631, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.1912434846162796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 7291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 441.0625, "completions/min_length": 332.0, "epoch": 10.723529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9559953212738037, "kl": 0.011139485519379377, "learning_rate": 5.241172487122744e-07, "loss": 0.000111361026938539, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 463.125, "completions/min_length": 378.0, "epoch": 10.725, "frac_reward_zero_std": 0.5, "grad_norm": 1.349214792251587, "kl": 0.013094161404296756, "learning_rate": 5.239890642606719e-07, "loss": 0.00012855781824328005, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 482.0625, "completions/min_length": 428.0, "epoch": 10.726470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7298387289047241, "kl": 0.011206911760382354, "learning_rate": 5.238608782287306e-07, "loss": 0.0001122206449508667, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 458.9375, "completions/min_length": 407.0, "epoch": 10.727941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.017957117408514023, "kl": 0.009116397704929113, "learning_rate": 5.23732690624895e-07, "loss": 9.030403452925384e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 479.25, "completions/min_length": 415.0, "epoch": 10.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011670514941215515, "kl": 0.008662597043439746, "learning_rate": 5.236045014576097e-07, "loss": 8.668941882206127e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 451.125, "completions/min_length": 378.0, "epoch": 10.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8270180821418762, "kl": 0.007536102784797549, "learning_rate": 5.234763107353197e-07, "loss": 7.573311449959874e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 450.0, "completions/min_length": 421.0, "epoch": 10.73235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.163568377494812, "kl": 0.014807907864451408, "learning_rate": 5.233481184664695e-07, "loss": 0.00014435709454119205, "reward": 0.7979166507720947, "reward_std": 0.21610505878925323, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 443.1875, "completions/min_length": 379.0, "epoch": 10.733823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.1684845685958862, "kl": 0.009840264217928052, "learning_rate": 5.232199246595046e-07, "loss": 9.818375110626221e-05, "reward": 0.40625, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 467.25, "completions/min_length": 417.0, "epoch": 10.735294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.11020028591156, "kl": 0.011979803442955017, "learning_rate": 5.230917293228698e-07, "loss": 0.00012055039405822754, "reward": 0.7051249742507935, "reward_std": 0.3339453339576721, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.48780280351638794, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 450.4375, "completions/min_length": 367.0, "epoch": 10.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.025523314252495766, "kl": 0.009967209305614233, "learning_rate": 5.229635324650104e-07, "loss": 0.00010014753934228793, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 456.8125, "completions/min_length": 407.0, "epoch": 10.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1135660409927368, "kl": 0.008960854378528893, "learning_rate": 5.228353340943713e-07, "loss": 8.904188871383667e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 529.1875, "completions/min_length": 391.0, "epoch": 10.739705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0741304159164429, "kl": 0.01096367510035634, "learning_rate": 5.227071342193984e-07, "loss": 0.00010930030111921951, "reward": 0.737500011920929, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 446.25, "completions/min_length": 400.0, "epoch": 10.741176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.017966050654649734, "kl": 0.018253207788802683, "learning_rate": 5.225789328485369e-07, "loss": 0.00018230229034088552, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 443.1875, "completions/min_length": 370.0, "epoch": 10.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.082139492034912, "kl": 0.010879129637032747, "learning_rate": 5.224507299902323e-07, "loss": 0.00010942661901935935, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 523.0, "completions/min_length": 467.0, "epoch": 10.744117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8614469170570374, "kl": 0.013669647509232163, "learning_rate": 5.223225256529308e-07, "loss": 0.00013701803982257843, "reward": 0.6583333611488342, "reward_std": 0.1287917196750641, "rewards/DrugCombAccuracyCOTORM/mean": 0.5833333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.4791968762874603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907118678092957, "step": 7306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 448.5625, "completions/min_length": 399.0, "epoch": 10.745588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9234960079193115, "kl": 0.011490020668134093, "learning_rate": 5.221943198450775e-07, "loss": 0.00011504990834509954, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 472.375, "completions/min_length": 430.0, "epoch": 10.74705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.023182639852166176, "kl": 0.008413347066380084, "learning_rate": 5.220661125751186e-07, "loss": 8.4407496615313e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 470.6875, "completions/min_length": 409.0, "epoch": 10.748529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8762819170951843, "kl": 0.009556451113894582, "learning_rate": 5.219379038515001e-07, "loss": 9.591877460479736e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 511.4375, "completions/min_length": 445.0, "epoch": 10.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.8533079028129578, "kl": 0.009669402614235878, "learning_rate": 5.21809693682668e-07, "loss": 9.699198562884703e-05, "reward": 0.887499988079071, "reward_std": 0.12026754766702652, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 7310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 479.75, "completions/min_length": 416.0, "epoch": 10.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9069205522537231, "kl": 0.012739677680656314, "learning_rate": 5.216814820770684e-07, "loss": 0.00012792646884918213, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 468.8125, "completions/min_length": 410.0, "epoch": 10.75294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014388416893780231, "kl": 0.008057522005401552, "learning_rate": 5.215532690431479e-07, "loss": 8.110431372188032e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 385.0625, "completions/min_length": 340.0, "epoch": 10.754411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01096025574952364, "kl": 0.009288398548960686, "learning_rate": 5.214250545893522e-07, "loss": 9.273092291550711e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 425.3125, "completions/min_length": 379.0, "epoch": 10.755882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.024988772347569466, "kl": 0.00952520768623799, "learning_rate": 5.212968387241284e-07, "loss": 9.620415221434087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 435.1875, "completions/min_length": 384.0, "epoch": 10.757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01151528861373663, "kl": 0.008300122688524425, "learning_rate": 5.211686214559227e-07, "loss": 8.224634802900255e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 378.125, "completions/min_length": 298.0, "epoch": 10.758823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010391445830464363, "kl": 0.008170501329004765, "learning_rate": 5.210404027931818e-07, "loss": 8.12411162769422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 452.5, "completions/min_length": 335.0, "epoch": 10.760294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.017113912850618362, "kl": 0.008281382150016725, "learning_rate": 5.209121827443522e-07, "loss": 8.316693129017949e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 452.3125, "completions/min_length": 403.0, "epoch": 10.761764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9618253707885742, "kl": 0.011241710977628827, "learning_rate": 5.207839613178813e-07, "loss": 0.0001112490936066024, "reward": 0.675000011920929, "reward_std": 0.18898223340511322, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 7318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 401.375, "completions/min_length": 369.0, "epoch": 10.763235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01063813827931881, "kl": 0.00715143809793517, "learning_rate": 5.206557385222153e-07, "loss": 7.155063940444961e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 467.0, "completions/min_length": 396.0, "epoch": 10.764705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.3694946765899658, "kl": 0.009355276240967214, "learning_rate": 5.205275143658017e-07, "loss": 9.457021951675415e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 412.625, "completions/min_length": 371.0, "epoch": 10.766176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.24842369556427, "kl": 0.009429249679669738, "learning_rate": 5.203992888570873e-07, "loss": 9.427964687347412e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 455.125, "completions/min_length": 419.0, "epoch": 10.76764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1848797798156738, "kl": 0.010984014719724655, "learning_rate": 5.202710620045194e-07, "loss": 0.00010963529348373413, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 490.875, "completions/min_length": 442.0, "epoch": 10.769117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011632164008915424, "kl": 0.007859450182877481, "learning_rate": 5.201428338165452e-07, "loss": 7.814939453965053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 479.3125, "completions/min_length": 405.0, "epoch": 10.770588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.788998544216156, "kl": 0.011302933329716325, "learning_rate": 5.200146043016123e-07, "loss": 0.00011479196837171912, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 514.1875, "completions/min_length": 430.0, "epoch": 10.772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.3358862400054932, "kl": 0.02372606610879302, "learning_rate": 5.198863734681677e-07, "loss": 0.00023749264073558152, "reward": 0.4699167013168335, "reward_std": 0.04212481901049614, "rewards/DrugCombAccuracyCOTORM/mean": 0.39520835876464844, "rewards/DrugCombAccuracyCOTORM/std": 0.28517499566078186, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5375000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.2305789738893509, "step": 7325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 440.75, "completions/min_length": 408.0, "epoch": 10.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012905002571642399, "kl": 0.00834706041496247, "learning_rate": 5.197581413246592e-07, "loss": 8.32221849123016e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 449.6875, "completions/min_length": 342.0, "epoch": 10.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.006937237922102213, "kl": 0.007115770946256816, "learning_rate": 5.196299078795343e-07, "loss": 7.160914537962526e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 486.0625, "completions/min_length": 418.0, "epoch": 10.776470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02028646320104599, "kl": 0.010362334316596389, "learning_rate": 5.195016731412408e-07, "loss": 0.00010560072405496612, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 467.75, "completions/min_length": 404.0, "epoch": 10.777941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.1932224035263062, "kl": 0.011418882058933377, "learning_rate": 5.193734371182264e-07, "loss": 0.00011432543396949768, "reward": 0.6499999761581421, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 438.9375, "completions/min_length": 361.0, "epoch": 10.779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.026818247511982918, "kl": 0.009366967482492328, "learning_rate": 5.192451998189391e-07, "loss": 9.426621545571834e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 430.875, "completions/min_length": 362.0, "epoch": 10.780882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8261284828186035, "kl": 0.010587360942736268, "learning_rate": 5.191169612518265e-07, "loss": 0.00010570883750915527, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 472.9375, "completions/min_length": 377.0, "epoch": 10.782352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.1185917854309082, "kl": 0.010152696631848812, "learning_rate": 5.189887214253372e-07, "loss": 0.0001026540994644165, "reward": 0.27543333172798157, "reward_std": 0.2880485951900482, "rewards/DrugCombAccuracyCOTORM/mean": 0.17328472435474396, "rewards/DrugCombAccuracyCOTORM/std": 0.3493395447731018, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3680555522441864, "rewards/DrugCombCoverageCOTORM/std": 0.8517411947250366, "step": 7332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 499.1875, "completions/min_length": 447.0, "epoch": 10.783823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.062595248222351, "kl": 0.01629701245110482, "learning_rate": 5.188604803479187e-07, "loss": 0.00016375299310311675, "reward": 0.6410000324249268, "reward_std": 0.05616641789674759, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 7333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 452.4375, "completions/min_length": 379.0, "epoch": 10.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9196363091468811, "kl": 0.010400923667475581, "learning_rate": 5.187322380280196e-07, "loss": 0.00010334700345993042, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 467.0625, "completions/min_length": 395.0, "epoch": 10.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1248860359191895, "kl": 0.01364425360225141, "learning_rate": 5.186039944740881e-07, "loss": 0.0001361189060844481, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 470.3125, "completions/min_length": 409.0, "epoch": 10.788235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.011718378402292728, "kl": 0.00867548247333616, "learning_rate": 5.184757496945725e-07, "loss": 8.656383579364046e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 462.0625, "completions/min_length": 415.0, "epoch": 10.78970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9042695760726929, "kl": 0.010583080700598657, "learning_rate": 5.183475036979212e-07, "loss": 0.00010557472705841064, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 7337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 472.6875, "completions/min_length": 413.0, "epoch": 10.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7956238389015198, "kl": 0.008859756402671337, "learning_rate": 5.18219256492583e-07, "loss": 8.836761116981506e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 422.0, "completions/min_length": 323.0, "epoch": 10.79264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01449776254594326, "kl": 0.009141835267655551, "learning_rate": 5.18091008087006e-07, "loss": 9.204573871102184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 467.3125, "completions/min_length": 372.0, "epoch": 10.794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9803605079650879, "kl": 0.01292866060975939, "learning_rate": 5.179627584896392e-07, "loss": 0.00013001008483115584, "reward": 0.7166666984558105, "reward_std": 0.16885468363761902, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.43461349606513977, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.5055250525474548, "step": 7340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 460.8125, "completions/min_length": 355.0, "epoch": 10.795588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3441298007965088, "kl": 0.010214666835963726, "learning_rate": 5.178345077089316e-07, "loss": 0.00010234862565994263, "reward": 0.8187500238418579, "reward_std": 0.3905540704727173, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 7341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 492.75, "completions/min_length": 409.0, "epoch": 10.797058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 2.3973379135131836, "kl": 0.00941528205294162, "learning_rate": 5.177062557533315e-07, "loss": 9.336708899354562e-05, "reward": 0.7108333706855774, "reward_std": 0.17412596940994263, "rewards/DrugCombAccuracyCOTORM/mean": 0.675000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4229525923728943, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.582141637802124, "step": 7342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 465.375, "completions/min_length": 425.0, "epoch": 10.798529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.331794261932373, "kl": 0.011231546523049474, "learning_rate": 5.175780026312881e-07, "loss": 0.00011279061436653137, "reward": 0.6500000357627869, "reward_std": 0.4208287000656128, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 443.75, "completions/min_length": 402.0, "epoch": 10.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.0860332250595093, "kl": 0.014311490347608924, "learning_rate": 5.174497483512505e-07, "loss": 0.00014262928743846714, "reward": 0.4713333249092102, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.36000001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.39273402094841003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 451.3125, "completions/min_length": 401.0, "epoch": 10.801470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.0706318616867065, "kl": 0.013293815078213811, "learning_rate": 5.173214929216676e-07, "loss": 0.0001312345266342163, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 435.5, "completions/min_length": 382.0, "epoch": 10.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.245888113975525, "kl": 0.012780526420101523, "learning_rate": 5.171932363509886e-07, "loss": 0.00012740533566102386, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 433.3125, "completions/min_length": 405.0, "epoch": 10.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013276582583785057, "kl": 0.008805922348983586, "learning_rate": 5.170649786476627e-07, "loss": 8.784193050814793e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 491.5, "completions/min_length": 403.0, "epoch": 10.805882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8987950682640076, "kl": 0.011991789331659675, "learning_rate": 5.169367198201391e-07, "loss": 0.00011876225471496582, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.4375, "completions/min_length": 401.0, "epoch": 10.80735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.7468223571777344, "kl": 0.009563386905938387, "learning_rate": 5.168084598768673e-07, "loss": 9.544715430820361e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 427.375, "completions/min_length": 363.0, "epoch": 10.808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008074202574789524, "kl": 0.0065502646612003446, "learning_rate": 5.166801988262966e-07, "loss": 6.53287788736634e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 472.9375, "completions/min_length": 388.0, "epoch": 10.810294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8888309597969055, "kl": 0.012160873506218195, "learning_rate": 5.165519366768768e-07, "loss": 0.0001225695013999939, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 460.125, "completions/min_length": 382.0, "epoch": 10.811764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9306023716926575, "kl": 0.009363309713080525, "learning_rate": 5.164236734370573e-07, "loss": 9.321194374933839e-05, "reward": 0.893750011920929, "reward_std": 0.1971900761127472, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 411.875, "completions/min_length": 363.0, "epoch": 10.813235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.014619402587413788, "kl": 0.006832541082985699, "learning_rate": 5.162954091152876e-07, "loss": 6.811537605244666e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.3125, "completions/min_length": 391.0, "epoch": 10.814705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018051303923130035, "kl": 0.014501424506306648, "learning_rate": 5.161671437200178e-07, "loss": 0.0001470931019866839, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 493.0625, "completions/min_length": 410.0, "epoch": 10.816176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9970250725746155, "kl": 0.013178582768887281, "learning_rate": 5.160388772596973e-07, "loss": 0.00013245023728813976, "reward": 0.27291667461395264, "reward_std": 0.16948509216308594, "rewards/DrugCombAccuracyCOTORM/mean": 0.21875, "rewards/DrugCombAccuracyCOTORM/std": 0.36371922492980957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 1.0144785642623901, "step": 7355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 480.1875, "completions/min_length": 408.0, "epoch": 10.81764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.666880488395691, "kl": 0.01583522278815508, "learning_rate": 5.159106097427763e-07, "loss": 0.00016504526138305664, "reward": 0.8921874761581421, "reward_std": 0.30493980646133423, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 523.75, "completions/min_length": 440.0, "epoch": 10.819117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3346081972122192, "kl": 0.011559325503185391, "learning_rate": 5.157823411777043e-07, "loss": 0.00011552870273590088, "reward": 0.7488095760345459, "reward_std": 0.2885800302028656, "rewards/DrugCombAccuracyCOTORM/mean": 0.6964285969734192, "rewards/DrugCombAccuracyCOTORM/std": 0.3784141540527344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2277100384235382, "step": 7357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 494.5625, "completions/min_length": 370.0, "epoch": 10.820588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.102792739868164, "kl": 0.009232141077518463, "learning_rate": 5.156540715729319e-07, "loss": 9.179079643217847e-05, "reward": 0.6589384078979492, "reward_std": 0.08314048498868942, "rewards/DrugCombAccuracyCOTORM/mean": 0.5958083868026733, "rewards/DrugCombAccuracyCOTORM/std": 0.43854820728302, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.21489663422107697, "step": 7358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 416.0625, "completions/min_length": 386.0, "epoch": 10.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.03599724546074867, "kl": 0.014177066506817937, "learning_rate": 5.155258009369089e-07, "loss": 0.00014116193051449955, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 470.5, "completions/min_length": 408.0, "epoch": 10.823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9430304765701294, "kl": 0.009650074178352952, "learning_rate": 5.153975292780853e-07, "loss": 9.668291022535414e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 456.625, "completions/min_length": 394.0, "epoch": 10.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.015594528056681156, "kl": 0.008903174893930554, "learning_rate": 5.152692566049114e-07, "loss": 8.884561248123646e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 477.1875, "completions/min_length": 426.0, "epoch": 10.826470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9343609809875488, "kl": 0.013386080507189035, "learning_rate": 5.151409829258376e-07, "loss": 0.00013415515422821045, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 471.625, "completions/min_length": 380.0, "epoch": 10.827941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9529178738594055, "kl": 0.00869353348389268, "learning_rate": 5.150127082493141e-07, "loss": 8.648261427879333e-05, "reward": 0.7124999761581421, "reward_std": 0.24164614081382751, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 446.5625, "completions/min_length": 357.0, "epoch": 10.829411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9457226395606995, "kl": 0.013918198528699577, "learning_rate": 5.148844325837913e-07, "loss": 0.00013782083988189697, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/mean_length": 541.375, "completions/min_length": 438.0, "epoch": 10.830882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8509653806686401, "kl": 0.010018291650339961, "learning_rate": 5.1475615593772e-07, "loss": 0.0001004934310913086, "reward": 0.6212620139122009, "reward_std": 0.05200020968914032, "rewards/DrugCombAccuracyCOTORM/mean": 0.5708483457565308, "rewards/DrugCombAccuracyCOTORM/std": 0.4453883171081543, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5335937142372131, "step": 7365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 462.5625, "completions/min_length": 431.0, "epoch": 10.83235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011343497782945633, "kl": 0.00778677174821496, "learning_rate": 5.146278783195503e-07, "loss": 7.777192513458431e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 483.875, "completions/min_length": 423.0, "epoch": 10.833823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9118106961250305, "kl": 0.01076292502693832, "learning_rate": 5.14499599737733e-07, "loss": 0.00010674756049411371, "reward": 0.643750011920929, "reward_std": 0.0970548465847969, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 7367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 466.25, "completions/min_length": 387.0, "epoch": 10.83529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.028393805027008057, "kl": 0.0110809993930161, "learning_rate": 5.143713202007188e-07, "loss": 0.00011156305845361203, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 533.875, "completions/min_length": 459.0, "epoch": 10.836764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.3199498653411865, "kl": 0.010313300997950137, "learning_rate": 5.142430397169584e-07, "loss": 0.0001034960150718689, "reward": 0.3229166865348816, "reward_std": 0.3202528953552246, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.49018141627311707, "step": 7369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 445.625, "completions/min_length": 358.0, "epoch": 10.838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.025517616420984268, "kl": 0.012159150675870478, "learning_rate": 5.141147582949026e-07, "loss": 0.0001214658550452441, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 509.6875, "completions/min_length": 473.0, "epoch": 10.839705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0309737920761108, "kl": 0.012054814025759697, "learning_rate": 5.139864759430022e-07, "loss": 0.00012098997831344604, "reward": 0.7642499804496765, "reward_std": 0.19521954655647278, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4155234098434448, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.28722813725471497, "step": 7371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 504.5, "completions/min_length": 391.0, "epoch": 10.841176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8010380864143372, "kl": 0.008717383490875363, "learning_rate": 5.138581926697082e-07, "loss": 8.843636169331148e-05, "reward": 0.9291666746139526, "reward_std": 0.07572401314973831, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.14907118678092957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.07453560829162598, "step": 7372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 477.0625, "completions/min_length": 381.0, "epoch": 10.842647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.4208699464797974, "kl": 0.013708529993891716, "learning_rate": 5.137299084834715e-07, "loss": 0.00013652443885803223, "reward": 0.38749998807907104, "reward_std": 0.4334043264389038, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 467.1875, "completions/min_length": 415.0, "epoch": 10.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.010247969999909401, "kl": 0.007780940388329327, "learning_rate": 5.136016233927432e-07, "loss": 7.87324970588088e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 471.8125, "completions/min_length": 401.0, "epoch": 10.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7937768697738647, "kl": 0.00876322656404227, "learning_rate": 5.134733374059744e-07, "loss": 8.788704872131348e-05, "reward": 0.8562500476837158, "reward_std": 0.05988579988479614, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 440.4375, "completions/min_length": 387.0, "epoch": 10.847058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8551961779594421, "kl": 0.014187239808961749, "learning_rate": 5.133450505316162e-07, "loss": 0.00014141213614493608, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 514.75, "completions/min_length": 425.0, "epoch": 10.848529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8726267218589783, "kl": 0.00925102038308978, "learning_rate": 5.132167627781199e-07, "loss": 9.264248365070671e-05, "reward": 0.7884833812713623, "reward_std": 0.12931936979293823, "rewards/DrugCombAccuracyCOTORM/mean": 0.7642500400543213, "rewards/DrugCombAccuracyCOTORM/std": 0.3222503066062927, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.25489044189453125, "step": 7377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 462.125, "completions/min_length": 398.0, "epoch": 10.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.009000697173178196, "kl": 0.006320282234810293, "learning_rate": 5.130884741539366e-07, "loss": 6.330233009066433e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 458.0625, "completions/min_length": 404.0, "epoch": 10.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1456795930862427, "kl": 0.014700440689921379, "learning_rate": 5.129601846675179e-07, "loss": 0.00014615921827498823, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 466.625, "completions/min_length": 438.0, "epoch": 10.852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8989254236221313, "kl": 0.011705697514116764, "learning_rate": 5.128318943273149e-07, "loss": 0.00011684473429340869, "reward": 0.9500000476837158, "reward_std": 0.0690065324306488, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/mean_length": 495.8125, "completions/min_length": 392.0, "epoch": 10.854411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8749660849571228, "kl": 0.009461583220399916, "learning_rate": 5.127036031417792e-07, "loss": 9.446591138839722e-05, "reward": 0.594034731388092, "reward_std": 0.10579119622707367, "rewards/DrugCombAccuracyCOTORM/mean": 0.5389843583106995, "rewards/DrugCombAccuracyCOTORM/std": 0.4993995130062103, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6284722089767456, "rewards/DrugCombCoverageCOTORM/std": 0.6613114476203918, "step": 7381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 407.125, "completions/min_length": 349.0, "epoch": 10.855882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012203714810311794, "kl": 0.007772411219775677, "learning_rate": 5.125753111193623e-07, "loss": 7.805916538927704e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 425.625, "completions/min_length": 361.0, "epoch": 10.85735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9993993639945984, "kl": 0.010696431621909142, "learning_rate": 5.124470182685157e-07, "loss": 0.00010650861077010632, "reward": 0.542187511920929, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 473.0625, "completions/min_length": 408.0, "epoch": 10.858823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.018853727728128433, "kl": 0.009943484095856547, "learning_rate": 5.123187245976911e-07, "loss": 9.958961163647473e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 430.0625, "completions/min_length": 348.0, "epoch": 10.860294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01780341938138008, "kl": 0.008998170960694551, "learning_rate": 5.121904301153399e-07, "loss": 8.918094681575894e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/mean_length": 546.375, "completions/min_length": 454.0, "epoch": 10.861764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8755901455879211, "kl": 0.008773543522693217, "learning_rate": 5.120621348299141e-07, "loss": 8.837636414682493e-05, "reward": 0.7835574746131897, "reward_std": 0.16378238797187805, "rewards/DrugCombAccuracyCOTORM/mean": 0.7395379543304443, "rewards/DrugCombAccuracyCOTORM/std": 0.38487645983695984, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9192708730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1974775791168213, "step": 7386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 469.5, "completions/min_length": 400.0, "epoch": 10.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.956732451915741, "kl": 0.00994526850990951, "learning_rate": 5.119338387498655e-07, "loss": 9.84259822871536e-05, "reward": 0.9552083015441895, "reward_std": 0.08368229866027832, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 500.6875, "completions/min_length": 453.0, "epoch": 10.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.014655746519565582, "kl": 0.008005498093552887, "learning_rate": 5.118055418836457e-07, "loss": 8.030091703403741e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 448.25, "completions/min_length": 411.0, "epoch": 10.866176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9423819184303284, "kl": 0.01063562580384314, "learning_rate": 5.116772442397068e-07, "loss": 0.00010637898230925202, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 446.0625, "completions/min_length": 390.0, "epoch": 10.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.011316186748445034, "kl": 0.007630437961779535, "learning_rate": 5.115489458265005e-07, "loss": 7.696530519751832e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/mean_length": 527.9375, "completions/min_length": 435.0, "epoch": 10.869117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7622358202934265, "kl": 0.009181258734315634, "learning_rate": 5.114206466524788e-07, "loss": 9.065207268577069e-05, "reward": 0.5476666688919067, "reward_std": 0.13482171297073364, "rewards/DrugCombAccuracyCOTORM/mean": 0.543958306312561, "rewards/DrugCombAccuracyCOTORM/std": 0.5006946325302124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 7391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 473.9375, "completions/min_length": 391.0, "epoch": 10.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8142395615577698, "kl": 0.010278755449689925, "learning_rate": 5.11292346726094e-07, "loss": 0.00010401755571365356, "reward": 0.9437500238418579, "reward_std": 0.13999362289905548, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 500.5625, "completions/min_length": 455.0, "epoch": 10.87205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.00900301244109869, "kl": 0.007833424489945173, "learning_rate": 5.111640460557979e-07, "loss": 7.838425517547876e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 450.875, "completions/min_length": 401.0, "epoch": 10.873529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.03308970481157303, "kl": 0.009041805402375758, "learning_rate": 5.110357446500428e-07, "loss": 9.063332981895655e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 417.875, "completions/min_length": 332.0, "epoch": 10.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.922633707523346, "kl": 0.01205811440013349, "learning_rate": 5.109074425172805e-07, "loss": 0.00011951848864555359, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 504.5625, "completions/min_length": 426.0, "epoch": 10.876470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 2.2327675819396973, "kl": 0.04432798526249826, "learning_rate": 5.107791396659636e-07, "loss": 0.0004314035177230835, "reward": 0.49375003576278687, "reward_std": 0.3520258665084839, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 443.0625, "completions/min_length": 408.0, "epoch": 10.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011150422506034374, "kl": 0.008438100223429501, "learning_rate": 5.106508361045444e-07, "loss": 8.426795830018818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 422.625, "completions/min_length": 379.0, "epoch": 10.879411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011023152619600296, "kl": 0.008240607217885554, "learning_rate": 5.105225318414748e-07, "loss": 8.195570990210399e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 487.8125, "completions/min_length": 408.0, "epoch": 10.880882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1089345216751099, "kl": 0.011037851450964808, "learning_rate": 5.103942268852077e-07, "loss": 0.00011102855205535889, "reward": 0.9028500318527222, "reward_std": 0.1799900233745575, "rewards/DrugCombAccuracyCOTORM/mean": 0.8832499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.3199307322502136, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.15000000596046448, "step": 7399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 454.4375, "completions/min_length": 396.0, "epoch": 10.882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008417579345405102, "kl": 0.007334522786550224, "learning_rate": 5.102659212441952e-07, "loss": 7.337922579608858e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 432.125, "completions/min_length": 379.0, "epoch": 10.883823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008178873918950558, "kl": 0.006356426980346441, "learning_rate": 5.101376149268897e-07, "loss": 6.37173798168078e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 480.5625, "completions/min_length": 388.0, "epoch": 10.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8150956630706787, "kl": 0.007996536325663328, "learning_rate": 5.100093079417438e-07, "loss": 8.07344913482666e-05, "reward": 0.8653833270072937, "reward_std": 0.15730416774749756, "rewards/DrugCombAccuracyCOTORM/mean": 0.8395416736602783, "rewards/DrugCombAccuracyCOTORM/std": 0.30119824409484863, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 7402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 418.6875, "completions/min_length": 325.0, "epoch": 10.886764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0238533653318882, "kl": 0.008958135382272303, "learning_rate": 5.098810002972101e-07, "loss": 8.893311314750463e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 476.0, "completions/min_length": 399.0, "epoch": 10.888235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9597536325454712, "kl": 0.012026365846395493, "learning_rate": 5.09752692001741e-07, "loss": 0.00012119114398956299, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 496.3125, "completions/min_length": 454.0, "epoch": 10.889705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9922369718551636, "kl": 0.008925915462896228, "learning_rate": 5.096243830637892e-07, "loss": 8.913210331229493e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 492.25, "completions/min_length": 405.0, "epoch": 10.891176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.009814747609198093, "kl": 0.0070534724509343505, "learning_rate": 5.094960734918076e-07, "loss": 6.980427133385092e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 456.0, "completions/min_length": 380.0, "epoch": 10.89264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7666786909103394, "kl": 0.008628253359347582, "learning_rate": 5.093677632942484e-07, "loss": 8.645009802421555e-05, "reward": 0.699999988079071, "reward_std": 0.2507132589817047, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 538.875, "completions/min_length": 485.0, "epoch": 10.894117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011308488436043262, "kl": 0.009047405095770955, "learning_rate": 5.092394524795649e-07, "loss": 8.945781155489385e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 441.8125, "completions/min_length": 330.0, "epoch": 10.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.5101003646850586, "kl": 0.012032905127853155, "learning_rate": 5.091111410562095e-07, "loss": 0.00012052059173583984, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 483.125, "completions/min_length": 395.0, "epoch": 10.897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.873997449874878, "kl": 0.010120504768565297, "learning_rate": 5.089828290326353e-07, "loss": 0.00010002176713896915, "reward": 0.8666666746139526, "reward_std": 0.12344267964363098, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.27216553688049316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 440.9375, "completions/min_length": 345.0, "epoch": 10.898529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009423747658729553, "kl": 0.0068842386826872826, "learning_rate": 5.08854516417295e-07, "loss": 6.939023296581581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 419.5, "completions/min_length": 359.0, "epoch": 10.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.017587419599294662, "kl": 0.010493318433873355, "learning_rate": 5.087262032186418e-07, "loss": 0.00010537763591855764, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 456.9375, "completions/min_length": 369.0, "epoch": 10.901470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.475542664527893, "kl": 0.011817068327218294, "learning_rate": 5.085978894451282e-07, "loss": 0.00011855363845825195, "reward": 0.7178333401679993, "reward_std": 0.36595210433006287, "rewards/DrugCombAccuracyCOTORM/mean": 0.6525000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.46795299649238586, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 7413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 388.0, "completions/min_length": 340.0, "epoch": 10.902941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.015745416283607483, "kl": 0.008737437892705202, "learning_rate": 5.084695751052074e-07, "loss": 8.77713318914175e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 503.0, "completions/min_length": 454.0, "epoch": 10.904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9200379252433777, "kl": 0.009086847538128495, "learning_rate": 5.083412602073324e-07, "loss": 9.07157373148948e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 463.0, "completions/min_length": 378.0, "epoch": 10.905882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.008801196701824665, "kl": 0.00890914665069431, "learning_rate": 5.082129447599564e-07, "loss": 8.969834016170353e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 448.8125, "completions/min_length": 407.0, "epoch": 10.907352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015778975561261177, "kl": 0.008029808988794684, "learning_rate": 5.080846287715322e-07, "loss": 8.024806447792798e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 505.25, "completions/min_length": 427.0, "epoch": 10.908823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7626234292984009, "kl": 0.009212285513058305, "learning_rate": 5.079563122505132e-07, "loss": 9.596601012162864e-05, "reward": 0.7945312857627869, "reward_std": 0.0022097050677984953, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.06404344737529755, "step": 7418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 480.75, "completions/min_length": 377.0, "epoch": 10.910294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8456693291664124, "kl": 0.009845723514445126, "learning_rate": 5.078279952053526e-07, "loss": 9.870529174804688e-05, "reward": 0.48749998211860657, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 468.5625, "completions/min_length": 428.0, "epoch": 10.911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012391462922096252, "kl": 0.009258813108317554, "learning_rate": 5.076996776445034e-07, "loss": 9.279340883949772e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 491.1875, "completions/min_length": 392.0, "epoch": 10.913235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0249264240264893, "kl": 0.009403728181496263, "learning_rate": 5.075713595764189e-07, "loss": 9.418278932571411e-05, "reward": 0.9869999885559082, "reward_std": 0.03676954284310341, "rewards/DrugCombAccuracyCOTORM/mean": 0.9837499856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.06499999761581421, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 453.75, "completions/min_length": 371.0, "epoch": 10.91470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00989480596035719, "kl": 0.010089673567563295, "learning_rate": 5.074430410095524e-07, "loss": 9.984728239942342e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 442.0625, "completions/min_length": 369.0, "epoch": 10.916176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.474260687828064, "kl": 0.014509430155158043, "learning_rate": 5.073147219523572e-07, "loss": 0.00014512240886688232, "reward": 0.2939999997615814, "reward_std": 0.24884970486164093, "rewards/DrugCombAccuracyCOTORM/mean": 0.18000000715255737, "rewards/DrugCombAccuracyCOTORM/std": 0.3341856002807617, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.3651483952999115, "step": 7423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 424.875, "completions/min_length": 354.0, "epoch": 10.91764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017060795798897743, "kl": 0.009662668919190764, "learning_rate": 5.071864024132866e-07, "loss": 9.723853872856125e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 424.5625, "completions/min_length": 366.0, "epoch": 10.919117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.3221055269241333, "kl": 0.00865569175221026, "learning_rate": 5.070580824007941e-07, "loss": 8.728374814381823e-05, "reward": 0.9513333439826965, "reward_std": 0.13765011727809906, "rewards/DrugCombAccuracyCOTORM/mean": 0.9443750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.2224999964237213, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 7425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 430.75, "completions/min_length": 348.0, "epoch": 10.920588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013369206339120865, "kl": 0.011509136063978076, "learning_rate": 5.06929761923333e-07, "loss": 0.0001148429000750184, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 409.1875, "completions/min_length": 371.0, "epoch": 10.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.027063684538006783, "kl": 0.010680412175133824, "learning_rate": 5.068014409893566e-07, "loss": 0.00010732279042713344, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 461.375, "completions/min_length": 386.0, "epoch": 10.923529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4146755933761597, "kl": 0.014590017031878233, "learning_rate": 5.066731196073186e-07, "loss": 0.0001457110047340393, "reward": 0.3687500059604645, "reward_std": 0.2522621154785156, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.8341662883758545, "step": 7428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 477.0625, "completions/min_length": 452.0, "epoch": 10.925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0107151810079813, "kl": 0.00831082509830594, "learning_rate": 5.065447977856722e-07, "loss": 8.298895409097895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 421.8125, "completions/min_length": 319.0, "epoch": 10.926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.4311660528182983, "kl": 0.02343499893322587, "learning_rate": 5.064164755328713e-07, "loss": 0.00023351609706878662, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 7430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 456.5, "completions/min_length": 413.0, "epoch": 10.927941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2662783861160278, "kl": 0.01138338167220354, "learning_rate": 5.06288152857369e-07, "loss": 0.00011447371798567474, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 395.1875, "completions/min_length": 366.0, "epoch": 10.929411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013949830085039139, "kl": 0.007753812940791249, "learning_rate": 5.061598297676192e-07, "loss": 7.754011312499642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 452.9375, "completions/min_length": 403.0, "epoch": 10.930882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0947060585021973, "kl": 0.007375780725851655, "learning_rate": 5.060315062720754e-07, "loss": 7.459521293640137e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 413.875, "completions/min_length": 341.0, "epoch": 10.93235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009592237882316113, "kl": 0.008577640284784138, "learning_rate": 5.059031823791912e-07, "loss": 8.568588236812502e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 502.4375, "completions/min_length": 440.0, "epoch": 10.933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01154074165970087, "kl": 0.00658868788741529, "learning_rate": 5.057748580974204e-07, "loss": 6.558357563335449e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 453.625, "completions/min_length": 373.0, "epoch": 10.935294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011914810165762901, "kl": 0.008807725738734007, "learning_rate": 5.056465334352165e-07, "loss": 8.884262933861464e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 535.5625, "completions/min_length": 441.0, "epoch": 10.936764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.6385796070098877, "kl": 0.011363940080627799, "learning_rate": 5.055182084010333e-07, "loss": 0.00011508166790008545, "reward": 0.26071667671203613, "reward_std": 0.18989403545856476, "rewards/DrugCombAccuracyCOTORM/mean": 0.15141665935516357, "rewards/DrugCombAccuracyCOTORM/std": 0.2486838698387146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3958333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.36955931782722473, "step": 7437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 445.5625, "completions/min_length": 382.0, "epoch": 10.938235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660502329468727, "kl": 0.008061898057349026, "learning_rate": 5.053898830033244e-07, "loss": 8.039084787014872e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 447.25, "completions/min_length": 395.0, "epoch": 10.939705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0213695764541626, "kl": 0.009826479363255203, "learning_rate": 5.052615572505436e-07, "loss": 9.700370719656348e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 475.5, "completions/min_length": 407.0, "epoch": 10.941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.2115696668624878, "kl": 0.01095477445051074, "learning_rate": 5.05133231151145e-07, "loss": 0.00010893482249230146, "reward": 0.44999998807907104, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 437.0625, "completions/min_length": 410.0, "epoch": 10.94264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04172441363334656, "kl": 0.010373380966484547, "learning_rate": 5.050049047135818e-07, "loss": 0.0001039086637319997, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 456.1875, "completions/min_length": 407.0, "epoch": 10.944117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.010882057249546051, "kl": 0.0079681834904477, "learning_rate": 5.048765779463085e-07, "loss": 7.953005842864513e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 497.875, "completions/min_length": 393.0, "epoch": 10.945588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0380054712295532, "kl": 0.009327547624707222, "learning_rate": 5.047482508577783e-07, "loss": 9.354469511890784e-05, "reward": 0.9166666865348816, "reward_std": 0.0690065324306488, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 498.625, "completions/min_length": 398.0, "epoch": 10.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.985466718673706, "kl": 0.010264557204209268, "learning_rate": 5.046199234564455e-07, "loss": 0.00010283137089572847, "reward": 0.6144132018089294, "reward_std": 0.16471782326698303, "rewards/DrugCombAccuracyCOTORM/mean": 0.5728124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5018232464790344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5616319179534912, "rewards/DrugCombCoverageCOTORM/std": 0.7944471836090088, "step": 7444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 439.3125, "completions/min_length": 370.0, "epoch": 10.948529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.013736826367676258, "kl": 0.007305903942324221, "learning_rate": 5.044915957507636e-07, "loss": 7.359557639574632e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 419.875, "completions/min_length": 364.0, "epoch": 10.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.009624776430428028, "kl": 0.008479473413899541, "learning_rate": 5.043632677491869e-07, "loss": 8.459814125671983e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 431.0625, "completions/min_length": 401.0, "epoch": 10.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011891595087945461, "kl": 0.009087088517844677, "learning_rate": 5.042349394601692e-07, "loss": 9.090661478694528e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 425.8125, "completions/min_length": 376.0, "epoch": 10.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.008393255062401295, "kl": 0.00557905132882297, "learning_rate": 5.041066108921645e-07, "loss": 5.565095489146188e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 451.8125, "completions/min_length": 378.0, "epoch": 10.954411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8952771425247192, "kl": 0.008827382232993841, "learning_rate": 5.039782820536265e-07, "loss": 8.855760097503662e-05, "reward": 0.5422499775886536, "reward_std": 0.0643245130777359, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.9227073788642883, "step": 7449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 422.4375, "completions/min_length": 368.0, "epoch": 10.955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01254885271191597, "kl": 0.00913788890466094, "learning_rate": 5.038499529530094e-07, "loss": 9.13377371034585e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 449.125, "completions/min_length": 416.0, "epoch": 10.95735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.201566457748413, "kl": 0.02207517542410642, "learning_rate": 5.03721623598767e-07, "loss": 0.00022565602557733655, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 437.75, "completions/min_length": 381.0, "epoch": 10.958823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.5124760866165161, "kl": 0.012501793913543224, "learning_rate": 5.035932939993536e-07, "loss": 0.00012587755918502808, "reward": 0.8374999761581421, "reward_std": 0.22480149567127228, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 422.0625, "completions/min_length": 379.0, "epoch": 10.96029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013604024425148964, "kl": 0.008004131261259317, "learning_rate": 5.03464964163223e-07, "loss": 7.968636055011302e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 420.4375, "completions/min_length": 375.0, "epoch": 10.961764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.015979966148734093, "kl": 0.011587322922423482, "learning_rate": 5.033366340988295e-07, "loss": 0.00011572500079637393, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 426.9375, "completions/min_length": 385.0, "epoch": 10.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.077201247215271, "kl": 0.01020275172777474, "learning_rate": 5.032083038146269e-07, "loss": 0.00010191628825850785, "reward": 0.5249999761581421, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 7455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 446.5625, "completions/min_length": 382.0, "epoch": 10.964705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015255468897521496, "kl": 0.008731385576538742, "learning_rate": 5.030799733190694e-07, "loss": 8.728473767405376e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 471.5, "completions/min_length": 394.0, "epoch": 10.966176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9046061038970947, "kl": 0.007837231969460845, "learning_rate": 5.02951642620611e-07, "loss": 7.836352597223595e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 480.3125, "completions/min_length": 383.0, "epoch": 10.967647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9550622701644897, "kl": 0.008535087807103992, "learning_rate": 5.028233117277057e-07, "loss": 8.508453902322799e-05, "reward": 0.875, "reward_std": 0.1035098284482956, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 441.75, "completions/min_length": 398.0, "epoch": 10.969117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.11399994790554047, "kl": 0.013952828361652792, "learning_rate": 5.02694980648808e-07, "loss": 0.00014239028678275645, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 464.125, "completions/min_length": 398.0, "epoch": 10.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01773706264793873, "kl": 0.010572190629318357, "learning_rate": 5.025666493923718e-07, "loss": 0.0001053730957210064, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 439.4375, "completions/min_length": 390.0, "epoch": 10.972058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012887037359178066, "kl": 0.008698784280568361, "learning_rate": 5.024383179668511e-07, "loss": 8.658383740112185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 424.625, "completions/min_length": 375.0, "epoch": 10.973529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.034845415502786636, "kl": 0.008427788619883358, "learning_rate": 5.023099863807002e-07, "loss": 8.38756823213771e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 452.9375, "completions/min_length": 407.0, "epoch": 10.975, "frac_reward_zero_std": 0.5, "grad_norm": 1.7443639039993286, "kl": 0.04766294173896313, "learning_rate": 5.021816546423732e-07, "loss": 0.00048315481399185956, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 399.5, "completions/min_length": 362.0, "epoch": 10.976470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.018370453268289566, "kl": 0.009830465074628592, "learning_rate": 5.020533227603244e-07, "loss": 9.837080142460763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 396.6875, "completions/min_length": 333.0, "epoch": 10.977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.010810711421072483, "kl": 0.007756469654850662, "learning_rate": 5.019249907430079e-07, "loss": 7.765179907437414e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 413.75, "completions/min_length": 372.0, "epoch": 10.979411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.038562264293432236, "kl": 0.014528244733810425, "learning_rate": 5.01796658598878e-07, "loss": 0.00014657771680504084, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.125, "completions/min_length": 409.0, "epoch": 10.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0789566040039062, "kl": 0.008629087707959116, "learning_rate": 5.016683263363885e-07, "loss": 8.677691221237183e-05, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 7467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 436.9375, "completions/min_length": 365.0, "epoch": 10.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014106137678027153, "kl": 0.008638017694465816, "learning_rate": 5.01539993963994e-07, "loss": 8.640466694487259e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 485.625, "completions/min_length": 454.0, "epoch": 10.983823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0172200202941895, "kl": 0.010922109009698033, "learning_rate": 5.014116614901487e-07, "loss": 0.00010974705219268799, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 406.1875, "completions/min_length": 367.0, "epoch": 10.985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.009517593309283257, "kl": 0.0060393757303245366, "learning_rate": 5.012833289233066e-07, "loss": 6.010709694237448e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 512.375, "completions/min_length": 433.0, "epoch": 10.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9140354990959167, "kl": 0.008217527996748686, "learning_rate": 5.01154996271922e-07, "loss": 8.144229650497437e-05, "reward": 0.550000011920929, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 456.125, "completions/min_length": 418.0, "epoch": 10.988235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.020085759460926056, "kl": 0.010492775938473642, "learning_rate": 5.010266635444495e-07, "loss": 0.00010515803296584636, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 460.375, "completions/min_length": 399.0, "epoch": 10.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.107248067855835, "kl": 0.010778463678434491, "learning_rate": 5.008983307493426e-07, "loss": 0.00010861456394195557, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 474.375, "completions/min_length": 386.0, "epoch": 10.991176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.013529345393180847, "kl": 0.007620897493325174, "learning_rate": 5.007699978950562e-07, "loss": 7.562173414044082e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 440.9375, "completions/min_length": 373.0, "epoch": 10.992647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.023079130798578262, "kl": 0.01132293394766748, "learning_rate": 5.00641664990044e-07, "loss": 0.00011195598199265078, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 468.75, "completions/min_length": 425.0, "epoch": 10.994117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012681112624704838, "kl": 0.01041872298810631, "learning_rate": 5.005133320427608e-07, "loss": 0.00010396752622909844, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 438.4375, "completions/min_length": 380.0, "epoch": 10.995588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9654257893562317, "kl": 0.010084426961839199, "learning_rate": 5.003849990616604e-07, "loss": 0.00010094046592712402, "reward": 0.4645833373069763, "reward_std": 0.06389319151639938, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5013870000839233, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.1875, "rewards/DrugCombCoverageCOTORM/std": 0.9105859398841858, "step": 7477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 460.4375, "completions/min_length": 410.0, "epoch": 10.99705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.372086763381958, "kl": 0.010814639739692211, "learning_rate": 5.002566660551975e-07, "loss": 0.00011035054922103882, "reward": 0.7894999980926514, "reward_std": 0.39062556624412537, "rewards/DrugCombAccuracyCOTORM/mean": 0.7603124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43035051226615906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 7478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 462.125, "completions/min_length": 389.0, "epoch": 10.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.011241971515119076, "kl": 0.009560172562487423, "learning_rate": 5.001283330318258e-07, "loss": 9.607132233213633e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 516.3125, "completions/min_length": 433.0, "epoch": 11.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.0748759508132935, "kl": 0.015774512896314263, "learning_rate": 5e-07, "loss": 0.00015625357627868652, "reward": 0.35324999690055847, "reward_std": 0.19425958395004272, "rewards/DrugCombAccuracyCOTORM/mean": 0.23583334684371948, "rewards/DrugCombAccuracyCOTORM/std": 0.3709038197994232, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6690928339958191, "step": 7480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 446.5625, "completions/min_length": 421.0, "epoch": 11.001470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.016330767422914505, "kl": 0.010566329583525658, "learning_rate": 4.998716669681742e-07, "loss": 0.00010611409379635006, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 473.8125, "completions/min_length": 420.0, "epoch": 11.00294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3678792715072632, "kl": 0.011508548865094781, "learning_rate": 4.997433339448026e-07, "loss": 0.00011453032493591309, "reward": 0.43125003576278687, "reward_std": 0.37801989912986755, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.8732125163078308, "step": 7482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 408.5, "completions/min_length": 371.0, "epoch": 11.004411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.007044883444905281, "kl": 0.0063662228640168905, "learning_rate": 4.996150009383395e-07, "loss": 6.366285379044712e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 486.1875, "completions/min_length": 422.0, "epoch": 11.005882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.299522876739502, "kl": 0.012458478566259146, "learning_rate": 4.994866679572393e-07, "loss": 0.00012496858835220337, "reward": 0.6723124980926514, "reward_std": 0.36115479469299316, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.48780280351638794, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.4732423722743988, "step": 7484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 450.75, "completions/min_length": 358.0, "epoch": 11.007352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0404057502746582, "kl": 0.01010066643357277, "learning_rate": 4.993583350099559e-07, "loss": 0.00010085850954055786, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 448.75, "completions/min_length": 405.0, "epoch": 11.008823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.015531562268733978, "kl": 0.010862280149012804, "learning_rate": 4.992300021049439e-07, "loss": 0.00010809754894580692, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 441.8125, "completions/min_length": 367.0, "epoch": 11.010294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9029058814048767, "kl": 0.010471033747307956, "learning_rate": 4.991016692506574e-07, "loss": 0.00010450380068505183, "reward": 0.6273333430290222, "reward_std": 0.04703797399997711, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 457.25, "completions/min_length": 382.0, "epoch": 11.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0175931453704834, "kl": 0.00976356235332787, "learning_rate": 4.989733364555506e-07, "loss": 9.788572788238525e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 459.75, "completions/min_length": 412.0, "epoch": 11.013235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9596177339553833, "kl": 0.013468928867951035, "learning_rate": 4.988450037280779e-07, "loss": 0.00013384222984313965, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 441.875, "completions/min_length": 370.0, "epoch": 11.014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.030655499547719955, "kl": 0.011130028520710766, "learning_rate": 4.987166710766934e-07, "loss": 0.00011048167652916163, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 468.4375, "completions/min_length": 417.0, "epoch": 11.016176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.015164665877819061, "kl": 0.010360775049775839, "learning_rate": 4.985883385098513e-07, "loss": 0.00010342546738684177, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 509.75, "completions/min_length": 461.0, "epoch": 11.01764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3584164381027222, "kl": 0.01042283303104341, "learning_rate": 4.984600060360059e-07, "loss": 0.00010454654693603516, "reward": 0.7078750133514404, "reward_std": 0.31144317984580994, "rewards/DrugCombAccuracyCOTORM/mean": 0.663303554058075, "rewards/DrugCombAccuracyCOTORM/std": 0.4542328417301178, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7723214626312256, "rewards/DrugCombCoverageCOTORM/std": 0.3597228229045868, "step": 7492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/mean_length": 515.3125, "completions/min_length": 403.0, "epoch": 11.019117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.984386146068573, "kl": 0.019719573901966214, "learning_rate": 4.983316736636114e-07, "loss": 0.00018946576165035367, "reward": 0.6069305539131165, "reward_std": 0.05488958954811096, "rewards/DrugCombAccuracyCOTORM/mean": 0.5254166722297668, "rewards/DrugCombAccuracyCOTORM/std": 0.4998916685581207, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8659722208976746, "rewards/DrugCombCoverageCOTORM/std": 0.2016584873199463, "step": 7493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 460.875, "completions/min_length": 376.0, "epoch": 11.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.934208869934082, "kl": 0.010410260176286101, "learning_rate": 4.98203341401122e-07, "loss": 0.00010478538024472073, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 436.25, "completions/min_length": 396.0, "epoch": 11.022058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0544824600219727, "kl": 0.009233595337718725, "learning_rate": 4.98075009256992e-07, "loss": 9.243190288543701e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 512.4375, "completions/min_length": 418.0, "epoch": 11.023529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2933311462402344, "kl": 0.010073772398754954, "learning_rate": 4.979466772396756e-07, "loss": 0.00010100007057189941, "reward": 0.9041666984558105, "reward_std": 0.2384980171918869, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 433.0625, "completions/min_length": 384.0, "epoch": 11.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.02448991686105728, "kl": 0.011822568951174617, "learning_rate": 4.978183453576267e-07, "loss": 0.00011714440188370645, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 441.0625, "completions/min_length": 327.0, "epoch": 11.026470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.032342661172151566, "kl": 0.013232588535174727, "learning_rate": 4.976900136192998e-07, "loss": 0.00013184751151129603, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 447.25, "completions/min_length": 384.0, "epoch": 11.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.019888488575816154, "kl": 0.009664137964136899, "learning_rate": 4.975616820331489e-07, "loss": 9.664309618528932e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 7499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 433.0, "completions/min_length": 385.0, "epoch": 11.029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.019979678094387054, "kl": 0.01120678591541946, "learning_rate": 4.974333506076282e-07, "loss": 0.00011199934670003131, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 467.3125, "completions/min_length": 401.0, "epoch": 11.030882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8867642879486084, "kl": 0.013728485675528646, "learning_rate": 4.97305019351192e-07, "loss": 0.00013786554336547852, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 481.9375, "completions/min_length": 403.0, "epoch": 11.032352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02509392239153385, "kl": 0.010546286590397358, "learning_rate": 4.971766882722942e-07, "loss": 0.00010567494609858841, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 465.8125, "completions/min_length": 412.0, "epoch": 11.033823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015557638369500637, "kl": 0.00956394150853157, "learning_rate": 4.970483573793891e-07, "loss": 9.551381663186476e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 433.6875, "completions/min_length": 367.0, "epoch": 11.035294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1267733573913574, "kl": 0.013721752911806107, "learning_rate": 4.969200266809307e-07, "loss": 0.00013710558414459229, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 467.3125, "completions/min_length": 374.0, "epoch": 11.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01286385953426361, "kl": 0.008350328775122762, "learning_rate": 4.967916961853732e-07, "loss": 8.363521192222834e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 456.25, "completions/min_length": 414.0, "epoch": 11.038235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.017704356461763382, "kl": 0.012354247970506549, "learning_rate": 4.966633659011705e-07, "loss": 0.00012307190627325326, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 476.5, "completions/min_length": 416.0, "epoch": 11.03970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012892268598079681, "kl": 0.009289370966143906, "learning_rate": 4.965350358367769e-07, "loss": 9.253606549464166e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 564.5, "completions/min_length": 498.0, "epoch": 11.041176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01754402555525303, "kl": 0.010037033818662167, "learning_rate": 4.964067060006464e-07, "loss": 0.00010032959107775241, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 456.3125, "completions/min_length": 397.0, "epoch": 11.04264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01515478640794754, "kl": 0.011450191494077444, "learning_rate": 4.96278376401233e-07, "loss": 0.00011349179840181023, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 461.3125, "completions/min_length": 426.0, "epoch": 11.044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.023463837802410126, "kl": 0.008710968308150768, "learning_rate": 4.961500470469907e-07, "loss": 8.65214824443683e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 449.375, "completions/min_length": 415.0, "epoch": 11.045588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.02663194015622139, "kl": 0.009597042109817266, "learning_rate": 4.960217179463735e-07, "loss": 9.587677050149068e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 448.375, "completions/min_length": 403.0, "epoch": 11.047058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7414588928222656, "kl": 0.008317710016854107, "learning_rate": 4.958933891078356e-07, "loss": 8.272309059975669e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 434.5625, "completions/min_length": 384.0, "epoch": 11.048529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010299565270543098, "kl": 0.007384640164673328, "learning_rate": 4.957650605398309e-07, "loss": 7.429078686982393e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 433.6875, "completions/min_length": 395.0, "epoch": 11.05, "frac_reward_zero_std": 0.5, "grad_norm": 0.9448326230049133, "kl": 0.013093423331156373, "learning_rate": 4.956367322508131e-07, "loss": 0.00012956559658050537, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/mean_length": 512.5, "completions/min_length": 410.0, "epoch": 11.051470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9029528498649597, "kl": 0.010378126055002213, "learning_rate": 4.955084042492364e-07, "loss": 0.00010448940884089097, "reward": 0.20625001192092896, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 452.1875, "completions/min_length": 395.0, "epoch": 11.052941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0166267156600952, "kl": 0.00851777195930481, "learning_rate": 4.953800765435546e-07, "loss": 8.576099935453385e-05, "reward": 0.7437499761581421, "reward_std": 0.21286733448505402, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 430.375, "completions/min_length": 384.0, "epoch": 11.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012650694698095322, "kl": 0.008584104827605188, "learning_rate": 4.952517491422217e-07, "loss": 8.588036871515214e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 500.3125, "completions/min_length": 361.0, "epoch": 11.055882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.7832673788070679, "kl": 0.009892971953377128, "learning_rate": 4.951234220536916e-07, "loss": 9.575537114869803e-05, "reward": 0.8843749761581421, "reward_std": 0.21409589052200317, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 0.9375, "rewards/DrugCombCOTFormatORM/std": 0.17078252136707306, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 439.9375, "completions/min_length": 379.0, "epoch": 11.05735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.00765751488506794, "kl": 0.007951947744004428, "learning_rate": 4.949950952864182e-07, "loss": 7.917974289739504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 433.0, "completions/min_length": 356.0, "epoch": 11.058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.016028793528676033, "kl": 0.008323945454321802, "learning_rate": 4.948667688488551e-07, "loss": 8.21393114165403e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 461.6875, "completions/min_length": 429.0, "epoch": 11.060294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02079608291387558, "kl": 0.008715323288924992, "learning_rate": 4.947384427494563e-07, "loss": 8.606167102698237e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 458.375, "completions/min_length": 404.0, "epoch": 11.061764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01550858560949564, "kl": 0.008424910833127797, "learning_rate": 4.946101169966756e-07, "loss": 8.42086155898869e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 516.75, "completions/min_length": 439.0, "epoch": 11.063235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3767375946044922, "kl": 0.009675712324678898, "learning_rate": 4.944817915989668e-07, "loss": 9.692460298538208e-05, "reward": 0.7553958296775818, "reward_std": 0.28094014525413513, "rewards/DrugCombAccuracyCOTORM/mean": 0.7092187404632568, "rewards/DrugCombAccuracyCOTORM/std": 0.38776251673698425, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8802083730697632, "rewards/DrugCombCoverageCOTORM/std": 0.16092541813850403, "step": 7523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 436.1875, "completions/min_length": 387.0, "epoch": 11.064705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022349951788783073, "kl": 0.0075123251881450415, "learning_rate": 4.943534665647835e-07, "loss": 7.552622992079705e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 454.875, "completions/min_length": 384.0, "epoch": 11.066176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.128159523010254, "kl": 0.012199715944007039, "learning_rate": 4.942251419025797e-07, "loss": 0.00012112036347389221, "reward": 0.5375000238418579, "reward_std": 0.051754921674728394, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 438.3125, "completions/min_length": 380.0, "epoch": 11.06764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016294214874505997, "kl": 0.008020612061955035, "learning_rate": 4.940968176208088e-07, "loss": 8.010937017388642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 420.0625, "completions/min_length": 354.0, "epoch": 11.069117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.007507546804845333, "kl": 0.007200988940894604, "learning_rate": 4.939684937279246e-07, "loss": 7.163469854276627e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 456.375, "completions/min_length": 428.0, "epoch": 11.070588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.7559032440185547, "kl": 0.01708359783515334, "learning_rate": 4.938401702323808e-07, "loss": 0.00017116963863372803, "reward": 0.9339166879653931, "reward_std": 0.14292733371257782, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4425306022167206, "step": 7528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 413.5625, "completions/min_length": 379.0, "epoch": 11.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01780330017209053, "kl": 0.008985624881461263, "learning_rate": 4.937118471426309e-07, "loss": 8.990716014523059e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 427.5, "completions/min_length": 362.0, "epoch": 11.073529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.009628647938370705, "kl": 0.007911131717264652, "learning_rate": 4.935835244671288e-07, "loss": 7.934684981592e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 429.9375, "completions/min_length": 368.0, "epoch": 11.075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01462298072874546, "kl": 0.008310085744597018, "learning_rate": 4.934552022143279e-07, "loss": 8.31021970952861e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 499.0, "completions/min_length": 395.0, "epoch": 11.076470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8450384736061096, "kl": 0.01611138053704053, "learning_rate": 4.933268803926815e-07, "loss": 0.00016759884601924568, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 434.8125, "completions/min_length": 397.0, "epoch": 11.077941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9002760052680969, "kl": 0.012631293386220932, "learning_rate": 4.931985590106435e-07, "loss": 0.00012753212649840862, "reward": 0.6794166564941406, "reward_std": 0.1366596668958664, "rewards/DrugCombAccuracyCOTORM/mean": 0.6175000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.45407047867774963, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 7533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 544.1875, "completions/min_length": 472.0, "epoch": 11.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8033215403556824, "kl": 0.009352180059067905, "learning_rate": 4.930702380766671e-07, "loss": 9.373513603350148e-05, "reward": 0.7965778112411499, "reward_std": 0.12903591990470886, "rewards/DrugCombAccuracyCOTORM/mean": 0.7735000252723694, "rewards/DrugCombAccuracyCOTORM/std": 0.3181341588497162, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7777777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.25337231159210205, "step": 7534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 468.1875, "completions/min_length": 392.0, "epoch": 11.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.6437375545501709, "kl": 0.0096232695505023, "learning_rate": 4.929419175992059e-07, "loss": 9.583859355188906e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 431.875, "completions/min_length": 376.0, "epoch": 11.08235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.938850998878479, "kl": 0.010099826497025788, "learning_rate": 4.928135975867133e-07, "loss": 9.797529492061585e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 420.625, "completions/min_length": 382.0, "epoch": 11.083823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012289110571146011, "kl": 0.008813646505586803, "learning_rate": 4.926852780476428e-07, "loss": 8.827130659483373e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 425.0, "completions/min_length": 368.0, "epoch": 11.08529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013824455440044403, "kl": 0.007987747667357326, "learning_rate": 4.925569589904477e-07, "loss": 8.014442573767155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 439.9375, "completions/min_length": 361.0, "epoch": 11.086764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.8985738754272461, "kl": 0.013935796217992902, "learning_rate": 4.924286404235811e-07, "loss": 0.00013962434604763985, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 421.0, "completions/min_length": 358.0, "epoch": 11.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012316286563873291, "kl": 0.009654931374825537, "learning_rate": 4.923003223554966e-07, "loss": 9.652112203184515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 475.125, "completions/min_length": 433.0, "epoch": 11.089705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.025231488049030304, "kl": 0.009890199638903141, "learning_rate": 4.921720047946474e-07, "loss": 9.900773147819564e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 429.4375, "completions/min_length": 393.0, "epoch": 11.091176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9089394211769104, "kl": 0.011180568020790815, "learning_rate": 4.920436877494867e-07, "loss": 0.00011300406913505867, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 485.5, "completions/min_length": 416.0, "epoch": 11.092647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8262823224067688, "kl": 0.011820929124951363, "learning_rate": 4.919153712284678e-07, "loss": 0.00011783875379478559, "reward": 0.30000001192092896, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 420.5, "completions/min_length": 361.0, "epoch": 11.094117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.011663801968097687, "kl": 0.0086178322089836, "learning_rate": 4.917870552400437e-07, "loss": 8.67658163770102e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 427.1875, "completions/min_length": 381.0, "epoch": 11.095588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010718487203121185, "kl": 0.007780098938383162, "learning_rate": 4.916587397926676e-07, "loss": 7.7604221587535e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 442.4375, "completions/min_length": 385.0, "epoch": 11.097058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.021263709291815758, "kl": 0.010762282880023122, "learning_rate": 4.915304248947926e-07, "loss": 0.00010787898645503446, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 422.0625, "completions/min_length": 381.0, "epoch": 11.098529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010439325124025345, "kl": 0.006976849283091724, "learning_rate": 4.914021105548718e-07, "loss": 6.987740198383108e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 490.8125, "completions/min_length": 447.0, "epoch": 11.1, "frac_reward_zero_std": 0.0, "grad_norm": 1.1933133602142334, "kl": 0.008550602244213223, "learning_rate": 4.912737967813582e-07, "loss": 8.632242679595947e-05, "reward": 0.6079750061035156, "reward_std": 0.35563063621520996, "rewards/DrugCombAccuracyCOTORM/mean": 0.5185624957084656, "rewards/DrugCombAccuracyCOTORM/std": 0.49947547912597656, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9312499761581421, "rewards/DrugCombCoverageCOTORM/std": 0.1887458711862564, "step": 7548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 444.375, "completions/min_length": 382.0, "epoch": 11.101470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.021766886115074158, "kl": 0.009467228082939982, "learning_rate": 4.91145483582705e-07, "loss": 9.469076758250594e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 422.375, "completions/min_length": 382.0, "epoch": 11.102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.011173377744853497, "kl": 0.008069552248343825, "learning_rate": 4.910171709673646e-07, "loss": 8.08282129582949e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 484.25, "completions/min_length": 424.0, "epoch": 11.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0309092998504639, "kl": 0.011224651476368308, "learning_rate": 4.908888589437904e-07, "loss": 0.00011292099952697754, "reward": 0.8585500121116638, "reward_std": 0.19521954655647278, "rewards/DrugCombAccuracyCOTORM/mean": 0.8372499942779541, "rewards/DrugCombAccuracyCOTORM/std": 0.3499020040035248, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.887499988079071, "rewards/DrugCombCoverageCOTORM/std": 0.24186773598194122, "step": 7551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 413.75, "completions/min_length": 338.0, "epoch": 11.105882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.011563410051167011, "kl": 0.008501200587488711, "learning_rate": 4.907605475204351e-07, "loss": 8.43649759190157e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 474.8125, "completions/min_length": 411.0, "epoch": 11.10735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0027894973754883, "kl": 0.015231161378324032, "learning_rate": 4.906322367057515e-07, "loss": 0.0001520216464996338, "reward": 0.8708333373069763, "reward_std": 0.20504549145698547, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 423.875, "completions/min_length": 373.0, "epoch": 11.108823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.014938508160412312, "kl": 0.00953556434251368, "learning_rate": 4.905039265081924e-07, "loss": 9.475932165514678e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 495.1875, "completions/min_length": 462.0, "epoch": 11.110294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.949296236038208, "kl": 0.00806365825701505, "learning_rate": 4.903756169362109e-07, "loss": 8.065998554229736e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 461.0625, "completions/min_length": 376.0, "epoch": 11.111764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013321015052497387, "kl": 0.007348706596530974, "learning_rate": 4.902473079982591e-07, "loss": 7.368343358393759e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 466.25, "completions/min_length": 406.0, "epoch": 11.113235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2176709175109863, "kl": 0.010897962842136621, "learning_rate": 4.9011899970279e-07, "loss": 0.00010893493890762329, "reward": 0.640625, "reward_std": 0.3128831386566162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 7557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 458.3125, "completions/min_length": 399.0, "epoch": 11.114705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013540517538785934, "kl": 0.007963700802065432, "learning_rate": 4.899906920582563e-07, "loss": 8.004197297850624e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 541.125, "completions/min_length": 466.0, "epoch": 11.116176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.8575135469436646, "kl": 0.011649447958916426, "learning_rate": 4.898623850731103e-07, "loss": 0.00011467933654785156, "reward": 0.7238333225250244, "reward_std": 0.13412657380104065, "rewards/DrugCombAccuracyCOTORM/mean": 0.6748809814453125, "rewards/DrugCombAccuracyCOTORM/std": 0.39752325415611267, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8392857313156128, "rewards/DrugCombCoverageCOTORM/std": 0.2142857015132904, "step": 7559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 406.5, "completions/min_length": 348.0, "epoch": 11.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7976528406143188, "kl": 0.009257078287191689, "learning_rate": 4.897340787558049e-07, "loss": 9.258091449737549e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 467.125, "completions/min_length": 373.0, "epoch": 11.119117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0545604228973389, "kl": 0.009162110276520252, "learning_rate": 4.896057731147924e-07, "loss": 9.027868509292603e-05, "reward": 0.4375, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 497.875, "completions/min_length": 426.0, "epoch": 11.120588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3988628387451172, "kl": 0.011380119482055306, "learning_rate": 4.894774681585251e-07, "loss": 0.00011450052261352539, "reward": 0.6499999761581421, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 461.8125, "completions/min_length": 419.0, "epoch": 11.12205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03546130284667015, "kl": 0.01154810725711286, "learning_rate": 4.893491638954557e-07, "loss": 0.00011647624342003837, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 430.25, "completions/min_length": 379.0, "epoch": 11.123529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.07993853092193604, "kl": 0.01319266832433641, "learning_rate": 4.892208603340363e-07, "loss": 0.00013129858416505158, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 418.25, "completions/min_length": 336.0, "epoch": 11.125, "frac_reward_zero_std": 0.5, "grad_norm": 1.0606061220169067, "kl": 0.007232946460135281, "learning_rate": 4.890925574827194e-07, "loss": 7.198587991297245e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 459.3125, "completions/min_length": 416.0, "epoch": 11.126470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.1165685653686523, "kl": 0.011230163741856813, "learning_rate": 4.889642553499573e-07, "loss": 0.00011401813389966264, "reward": 0.675000011920929, "reward_std": 0.20177781581878662, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 441.0, "completions/min_length": 415.0, "epoch": 11.12794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.057461142539978, "kl": 0.010312747675925493, "learning_rate": 4.888359539442022e-07, "loss": 0.00010363002365920693, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 437.125, "completions/min_length": 375.0, "epoch": 11.129411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012860334478318691, "kl": 0.00946057413239032, "learning_rate": 4.88707653273906e-07, "loss": 9.438724373467267e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 491.8125, "completions/min_length": 458.0, "epoch": 11.130882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.1983847618103027, "kl": 0.01126818172633648, "learning_rate": 4.885793533475211e-07, "loss": 0.00011272728443145752, "reward": 0.23750001192092896, "reward_std": 0.1767767071723938, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 449.8125, "completions/min_length": 401.0, "epoch": 11.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012687860056757927, "kl": 0.007825398817658424, "learning_rate": 4.884510541734994e-07, "loss": 7.847080269129947e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 419.3125, "completions/min_length": 329.0, "epoch": 11.133823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01295036543160677, "kl": 0.008685056120157242, "learning_rate": 4.883227557602932e-07, "loss": 8.757496834732592e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 475.375, "completions/min_length": 393.0, "epoch": 11.135294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.009982253424823284, "kl": 0.007735958555713296, "learning_rate": 4.881944581163542e-07, "loss": 7.762460154481232e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 445.75, "completions/min_length": 391.0, "epoch": 11.136764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4004968404769897, "kl": 0.01248488062992692, "learning_rate": 4.880661612501345e-07, "loss": 0.00012364983558654785, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 435.8125, "completions/min_length": 409.0, "epoch": 11.138235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.864641547203064, "kl": 0.01026115589775145, "learning_rate": 4.879378651700859e-07, "loss": 0.00010272931831423193, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 483.1875, "completions/min_length": 401.0, "epoch": 11.139705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.1975828409194946, "kl": 0.00977688143029809, "learning_rate": 4.878095698846602e-07, "loss": 9.666383266448975e-05, "reward": 0.5874999761581421, "reward_std": 0.39518100023269653, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 502.9375, "completions/min_length": 460.0, "epoch": 11.141176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0451016426086426, "kl": 0.01029041747096926, "learning_rate": 4.87681275402309e-07, "loss": 0.00010379403829574585, "reward": 0.8569583296775818, "reward_std": 0.16276514530181885, "rewards/DrugCombAccuracyCOTORM/mean": 0.8329166769981384, "rewards/DrugCombAccuracyCOTORM/std": 0.31513404846191406, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.1717960685491562, "step": 7576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 517.4375, "completions/min_length": 396.0, "epoch": 11.14264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8623523116111755, "kl": 0.013433239422738552, "learning_rate": 4.875529817314844e-07, "loss": 0.0001352880644844845, "reward": 0.8901041746139526, "reward_std": 0.09437784552574158, "rewards/DrugCombAccuracyCOTORM/mean": 0.8645833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.21273136138916016, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.042695630341768265, "step": 7577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 379.0625, "completions/min_length": 310.0, "epoch": 11.144117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.5569736957550049, "kl": 0.01525311975274235, "learning_rate": 4.874246888806378e-07, "loss": 0.00014562904834747314, "reward": 0.6546041965484619, "reward_std": 0.047317225486040115, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 7578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 475.5, "completions/min_length": 369.0, "epoch": 11.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0240869522094727, "kl": 0.01541155343875289, "learning_rate": 4.87296396858221e-07, "loss": 0.0001530870795249939, "reward": 0.6430833339691162, "reward_std": 0.012376055121421814, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43165960907936096, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.24247947335243225, "step": 7579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 435.6875, "completions/min_length": 342.0, "epoch": 11.147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.08007163554430008, "kl": 0.013860701816156507, "learning_rate": 4.871681056726852e-07, "loss": 0.00013825861969962716, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 426.9375, "completions/min_length": 378.0, "epoch": 11.148529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011098039336502552, "kl": 0.008852016529999673, "learning_rate": 4.870398153324823e-07, "loss": 8.827526471577585e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 437.0625, "completions/min_length": 351.0, "epoch": 11.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.011444959789514542, "kl": 0.009686075267381966, "learning_rate": 4.869115258460634e-07, "loss": 9.693455649539828e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 493.3125, "completions/min_length": 411.0, "epoch": 11.151470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8015999794006348, "kl": 0.00948219618294388, "learning_rate": 4.867832372218802e-07, "loss": 9.507415961707011e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 480.1875, "completions/min_length": 415.0, "epoch": 11.152941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9588125944137573, "kl": 0.010126221342943609, "learning_rate": 4.866549494683838e-07, "loss": 0.0001003555953502655, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 7584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 475.25, "completions/min_length": 435.0, "epoch": 11.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01473994366824627, "kl": 0.009195795748382807, "learning_rate": 4.865266625940258e-07, "loss": 9.24455453059636e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 480.6875, "completions/min_length": 400.0, "epoch": 11.155882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01628708839416504, "kl": 0.010457372991368175, "learning_rate": 4.863983766072569e-07, "loss": 0.00010455766459926963, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 464.9375, "completions/min_length": 406.0, "epoch": 11.157352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0932105779647827, "kl": 0.010958327678963542, "learning_rate": 4.862700915165286e-07, "loss": 0.00010926784307230264, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 452.125, "completions/min_length": 389.0, "epoch": 11.158823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494398415088654, "kl": 0.013327938970178366, "learning_rate": 4.861418073302918e-07, "loss": 0.0001331781968474388, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 491.25, "completions/min_length": 415.0, "epoch": 11.160294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4630126953125, "kl": 0.013345963088795543, "learning_rate": 4.860135240569978e-07, "loss": 0.00013460218906402588, "reward": 0.612500011920929, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 457.5, "completions/min_length": 422.0, "epoch": 11.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.08744250982999802, "kl": 0.015251552918925881, "learning_rate": 4.858852417050974e-07, "loss": 0.0001512497547082603, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 404.4375, "completions/min_length": 340.0, "epoch": 11.163235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.013069181703031063, "kl": 0.007474201382137835, "learning_rate": 4.857569602830418e-07, "loss": 7.440161425620317e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 441.3125, "completions/min_length": 371.0, "epoch": 11.16470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0278041362762451, "kl": 0.009777239058166742, "learning_rate": 4.856286797992813e-07, "loss": 9.763834532350302e-05, "reward": 0.9052083492279053, "reward_std": 0.10225021839141846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 430.8125, "completions/min_length": 343.0, "epoch": 11.166176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.3614987134933472, "kl": 0.010942140477709472, "learning_rate": 4.85500400262267e-07, "loss": 0.00010907649993896484, "reward": 0.5892499685287476, "reward_std": 0.40559133887290955, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371693611145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6962199807167053, "step": 7593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 507.0625, "completions/min_length": 439.0, "epoch": 11.16764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0136418342590332, "kl": 0.012548702768981457, "learning_rate": 4.853721216804499e-07, "loss": 0.00012509990483522415, "reward": 0.8802083730697632, "reward_std": 0.10019201785326004, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.22669117152690887, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 452.875, "completions/min_length": 400.0, "epoch": 11.169117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01200831588357687, "kl": 0.007388933910988271, "learning_rate": 4.852438440622802e-07, "loss": 7.3718634666875e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 482.5625, "completions/min_length": 394.0, "epoch": 11.170588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8748986124992371, "kl": 0.012179981335066259, "learning_rate": 4.851155674162086e-07, "loss": 0.00012156367301940918, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 477.4375, "completions/min_length": 417.0, "epoch": 11.172058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4325222969055176, "kl": 0.015833056531846523, "learning_rate": 4.849872917506861e-07, "loss": 0.00016448646783828735, "reward": 0.4749999940395355, "reward_std": 0.38490793108940125, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 500.625, "completions/min_length": 434.0, "epoch": 11.173529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8458705544471741, "kl": 0.00954517989885062, "learning_rate": 4.848590170741626e-07, "loss": 9.58368182182312e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 449.375, "completions/min_length": 398.0, "epoch": 11.175, "frac_reward_zero_std": 0.5, "grad_norm": 1.0400141477584839, "kl": 0.010432132752612233, "learning_rate": 4.847307433950887e-07, "loss": 0.00010423391358926892, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 442.125, "completions/min_length": 416.0, "epoch": 11.176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.022689208388328552, "kl": 0.00872724805958569, "learning_rate": 4.846024707219149e-07, "loss": 8.681300823809579e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/mean_length": 506.375, "completions/min_length": 394.0, "epoch": 11.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0578659772872925, "kl": 0.009908688254654408, "learning_rate": 4.844741990630912e-07, "loss": 9.791526827029884e-05, "reward": 0.6488125324249268, "reward_std": 0.07651638984680176, "rewards/DrugCombAccuracyCOTORM/mean": 0.6000781059265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43051791191101074, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.34359216690063477, "step": 7601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 426.75, "completions/min_length": 396.0, "epoch": 11.179411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.008922099135816097, "kl": 0.0066778401378542185, "learning_rate": 4.843459284270681e-07, "loss": 6.646307156188414e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 439.25, "completions/min_length": 385.0, "epoch": 11.180882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.008098547346889973, "kl": 0.006711226888000965, "learning_rate": 4.842176588222957e-07, "loss": 6.748511805199087e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 429.1875, "completions/min_length": 383.0, "epoch": 11.18235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.028236085548996925, "kl": 0.008595437044277787, "learning_rate": 4.84089390257224e-07, "loss": 8.574387175031006e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 427.5625, "completions/min_length": 359.0, "epoch": 11.183823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.012299616821110249, "kl": 0.00957690307404846, "learning_rate": 4.839611227403028e-07, "loss": 9.55938157858327e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 466.0625, "completions/min_length": 418.0, "epoch": 11.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9758877754211426, "kl": 0.013551721232943237, "learning_rate": 4.838328562799824e-07, "loss": 0.00013796435086987913, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 7606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 456.625, "completions/min_length": 386.0, "epoch": 11.186764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01730765961110592, "kl": 0.009932048385962844, "learning_rate": 4.837045908847124e-07, "loss": 9.936615242622793e-05, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 7607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 442.9375, "completions/min_length": 352.0, "epoch": 11.188235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.015751169994473457, "kl": 0.008468881365843117, "learning_rate": 4.835763265629428e-07, "loss": 8.48098861752078e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 447.125, "completions/min_length": 395.0, "epoch": 11.189705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.014584542252123356, "kl": 0.00764664018061012, "learning_rate": 4.834480633231233e-07, "loss": 7.663779251743108e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 446.5625, "completions/min_length": 383.0, "epoch": 11.191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011960667558014393, "kl": 0.008019496686756611, "learning_rate": 4.833198011737034e-07, "loss": 7.985349657246843e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 467.1875, "completions/min_length": 403.0, "epoch": 11.19264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8527632355690002, "kl": 0.011503288871608675, "learning_rate": 4.831915401231328e-07, "loss": 0.00011658377479761839, "reward": 0.8511833548545837, "reward_std": 0.0013670751359313726, "rewards/DrugCombAccuracyCOTORM/mean": 0.831166684627533, "rewards/DrugCombAccuracyCOTORM/std": 0.1745712161064148, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.862500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.15000000596046448, "step": 7611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 499.8125, "completions/min_length": 401.0, "epoch": 11.194117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.2503516674041748, "kl": 0.013923670165240765, "learning_rate": 4.830632801798611e-07, "loss": 0.00013878941535949707, "reward": 0.5910714268684387, "reward_std": 0.3221935033798218, "rewards/DrugCombAccuracyCOTORM/mean": 0.5357142686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.49897855520248413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 545.4375, "completions/min_length": 464.0, "epoch": 11.195588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.2445619106292725, "kl": 0.013169853016734123, "learning_rate": 4.829350213523374e-07, "loss": 0.00013045966625213623, "reward": 0.658750057220459, "reward_std": 0.34391191601753235, "rewards/DrugCombAccuracyCOTORM/mean": 0.612500011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.4193248450756073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5370530486106873, "step": 7613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 451.5, "completions/min_length": 386.0, "epoch": 11.197058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.449163794517517, "kl": 0.011475053615868092, "learning_rate": 4.828067636490115e-07, "loss": 0.00011589378118515015, "reward": 0.7250000238418579, "reward_std": 0.4286505877971649, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 481.5, "completions/min_length": 454.0, "epoch": 11.198529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.1235315799713135, "kl": 0.010769518092274666, "learning_rate": 4.826785070783326e-07, "loss": 0.00010710509377531707, "reward": 0.8589166402816772, "reward_std": 0.19595398008823395, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 465.375, "completions/min_length": 416.0, "epoch": 11.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.01750260218977928, "kl": 0.0059320456348359585, "learning_rate": 4.825502516487496e-07, "loss": 5.9502021031221375e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 465.875, "completions/min_length": 418.0, "epoch": 11.201470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3743473291397095, "kl": 0.011922611854970455, "learning_rate": 4.82421997368712e-07, "loss": 0.00011862069368362427, "reward": 0.4625000059604645, "reward_std": 0.23691894114017487, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 456.625, "completions/min_length": 387.0, "epoch": 11.202941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9271535277366638, "kl": 0.009990425547584891, "learning_rate": 4.822937442466686e-07, "loss": 0.00010011447011493146, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 432.875, "completions/min_length": 374.0, "epoch": 11.204411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0341897010803223, "kl": 0.012970083276741207, "learning_rate": 4.821654922910686e-07, "loss": 0.00012903288006782532, "reward": 0.6299999952316284, "reward_std": 0.05796550586819649, "rewards/DrugCombAccuracyCOTORM/mean": 0.5687500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.45213383436203003, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 7619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 503.0625, "completions/min_length": 437.0, "epoch": 11.205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.03876449167728424, "kl": 0.01273907138966024, "learning_rate": 4.820372415103607e-07, "loss": 0.00012617513129953295, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 447.4375, "completions/min_length": 404.0, "epoch": 11.20735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.179247498512268, "kl": 0.008973756805062294, "learning_rate": 4.819089919129942e-07, "loss": 8.868565782904625e-05, "reward": 0.6053333282470703, "reward_std": 0.1772606372833252, "rewards/DrugCombAccuracyCOTORM/mean": 0.5900000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4849192500114441, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3333333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.9349193572998047, "step": 7621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 460.25, "completions/min_length": 398.0, "epoch": 11.208823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8748634457588196, "kl": 0.01257233670912683, "learning_rate": 4.817807435074172e-07, "loss": 0.00012488612264860421, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 505.875, "completions/min_length": 411.0, "epoch": 11.21029411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3334158658981323, "kl": 0.016134100616909564, "learning_rate": 4.816524963020789e-07, "loss": 0.00015440583229064941, "reward": 0.4753749966621399, "reward_std": 0.26214712858200073, "rewards/DrugCombAccuracyCOTORM/mean": 0.39500001072883606, "rewards/DrugCombAccuracyCOTORM/std": 0.4291260540485382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.48196646571159363, "step": 7623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 412.9375, "completions/min_length": 382.0, "epoch": 11.211764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.009674531407654285, "kl": 0.008404827676713467, "learning_rate": 4.815242503054276e-07, "loss": 8.407244604313746e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 475.0, "completions/min_length": 410.0, "epoch": 11.213235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1872097253799438, "kl": 0.012369799660518765, "learning_rate": 4.813960055259119e-07, "loss": 0.00012513995170593262, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 544.625, "completions/min_length": 503.0, "epoch": 11.214705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7361437678337097, "kl": 0.006786566926166415, "learning_rate": 4.812677619719805e-07, "loss": 6.748735904693604e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 520.9375, "completions/min_length": 416.0, "epoch": 11.216176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7630000114440918, "kl": 0.00969896896276623, "learning_rate": 4.811395196520814e-07, "loss": 9.557970042806119e-05, "reward": 0.6260208487510681, "reward_std": 0.07702043652534485, "rewards/DrugCombAccuracyCOTORM/mean": 0.5338281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.49889159202575684, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 7627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 432.0625, "completions/min_length": 375.0, "epoch": 11.217647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.008913801982998848, "kl": 0.007727077696472406, "learning_rate": 4.81011278574663e-07, "loss": 7.73775827838108e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 439.9375, "completions/min_length": 395.0, "epoch": 11.219117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.060702092945575714, "kl": 0.011739005567505956, "learning_rate": 4.808830387481735e-07, "loss": 0.00011819499195553362, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 488.875, "completions/min_length": 422.0, "epoch": 11.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9918481707572937, "kl": 0.009322278900071979, "learning_rate": 4.80754800181061e-07, "loss": 9.262798994313926e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 495.875, "completions/min_length": 433.0, "epoch": 11.222058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.1593791246414185, "kl": 0.007246787194162607, "learning_rate": 4.806265628817736e-07, "loss": 7.192045450210571e-05, "reward": 0.8928333520889282, "reward_std": 0.22285377979278564, "rewards/DrugCombAccuracyCOTORM/mean": 0.8712500333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.2830282747745514, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 7631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 457.8125, "completions/min_length": 402.0, "epoch": 11.223529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9608145356178284, "kl": 0.01033815264236182, "learning_rate": 4.804983268587592e-07, "loss": 0.00010267027391819283, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 412.875, "completions/min_length": 332.0, "epoch": 11.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.024751853197813034, "kl": 0.010810724692419171, "learning_rate": 4.803700921204658e-07, "loss": 0.00010808233491843566, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 434.75, "completions/min_length": 354.0, "epoch": 11.226470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03530427813529968, "kl": 0.010333452839404345, "learning_rate": 4.802418586753409e-07, "loss": 0.00010316394036635756, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 476.6875, "completions/min_length": 427.0, "epoch": 11.227941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.013905921019613743, "kl": 0.008721623686142266, "learning_rate": 4.801136265318325e-07, "loss": 8.658271690364927e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 413.5, "completions/min_length": 368.0, "epoch": 11.229411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01753637194633484, "kl": 0.007777800899930298, "learning_rate": 4.799853956983879e-07, "loss": 7.787297363393009e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 404.125, "completions/min_length": 353.0, "epoch": 11.230882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.011656958609819412, "kl": 0.006260066176764667, "learning_rate": 4.798571661834547e-07, "loss": 6.262485112529248e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 417.0, "completions/min_length": 376.0, "epoch": 11.23235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014989527873694897, "kl": 0.010403551626950502, "learning_rate": 4.797289379954806e-07, "loss": 0.0001040638962876983, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 479.9375, "completions/min_length": 428.0, "epoch": 11.233823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7449353933334351, "kl": 0.007607559207826853, "learning_rate": 4.796007111429128e-07, "loss": 7.573515176773071e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 443.5, "completions/min_length": 407.0, "epoch": 11.235294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9094823598861694, "kl": 0.011879507452249527, "learning_rate": 4.794724856341984e-07, "loss": 0.00011780665954574943, "reward": 0.9051250219345093, "reward_std": 0.17601576447486877, "rewards/DrugCombAccuracyCOTORM/mean": 0.8853124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.314830482006073, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 475.8125, "completions/min_length": 390.0, "epoch": 11.236764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.79123455286026, "kl": 0.008688654634170234, "learning_rate": 4.793442614777848e-07, "loss": 8.69528012117371e-05, "reward": 0.6000000238418579, "reward_std": 0.16903086006641388, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 444.0, "completions/min_length": 401.0, "epoch": 11.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3363457918167114, "kl": 0.011153432773426175, "learning_rate": 4.792160386821189e-07, "loss": 0.00011157813423778862, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 425.8125, "completions/min_length": 368.0, "epoch": 11.239705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.03135376796126366, "kl": 0.008001435897313058, "learning_rate": 4.790878172556477e-07, "loss": 7.960648508742452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 407.25, "completions/min_length": 332.0, "epoch": 11.241176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.932277500629425, "kl": 0.009518075501546264, "learning_rate": 4.789595972068183e-07, "loss": 9.665334800956771e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 477.0625, "completions/min_length": 392.0, "epoch": 11.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2223801612854004, "kl": 0.01156468759290874, "learning_rate": 4.788313785440775e-07, "loss": 0.00011502206325531006, "reward": 0.6048749685287476, "reward_std": 0.07347539812326431, "rewards/DrugCombAccuracyCOTORM/mean": 0.5490624904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47694242000579834, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.5108770728111267, "step": 7645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 458.3125, "completions/min_length": 394.0, "epoch": 11.244117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013200851157307625, "kl": 0.006402598577551544, "learning_rate": 4.787031612758717e-07, "loss": 6.348703027470037e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 494.875, "completions/min_length": 437.0, "epoch": 11.245588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8904019594192505, "kl": 0.017257785890251398, "learning_rate": 4.785749454106478e-07, "loss": 0.00017160199058707803, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 434.4375, "completions/min_length": 372.0, "epoch": 11.24705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01589754782617092, "kl": 0.008418445708230138, "learning_rate": 4.784467309568523e-07, "loss": 8.471264300169423e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 466.375, "completions/min_length": 394.0, "epoch": 11.248529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.020489167422056198, "kl": 0.010620080633088946, "learning_rate": 4.783185179229316e-07, "loss": 0.00010557528003118932, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 471.0, "completions/min_length": 420.0, "epoch": 11.25, "frac_reward_zero_std": 0.0, "grad_norm": 1.568566918373108, "kl": 0.01273322431370616, "learning_rate": 4.78190306317332e-07, "loss": 0.0001272261142730713, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 447.4375, "completions/min_length": 362.0, "epoch": 11.251470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.1608484983444214, "kl": 0.00993780407588929, "learning_rate": 4.780620961485e-07, "loss": 9.897351264953613e-05, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 471.25, "completions/min_length": 379.0, "epoch": 11.25294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.013452038168907166, "kl": 0.00888275180477649, "learning_rate": 4.779338874248815e-07, "loss": 8.91505551408045e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 423.875, "completions/min_length": 367.0, "epoch": 11.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009121008217334747, "kl": 0.005914195440709591, "learning_rate": 4.778056801549226e-07, "loss": 5.9072750445920974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 403.25, "completions/min_length": 381.0, "epoch": 11.255882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.010283036157488823, "kl": 0.007550726877525449, "learning_rate": 4.776774743470694e-07, "loss": 7.558964716736227e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 475.9375, "completions/min_length": 419.0, "epoch": 11.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.08100385218858719, "kl": 0.013464075978845358, "learning_rate": 4.775492700097677e-07, "loss": 0.00013524630048777908, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 416.1875, "completions/min_length": 329.0, "epoch": 11.258823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8949626088142395, "kl": 0.007873870432376862, "learning_rate": 4.774210671514632e-07, "loss": 7.931143045425415e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 472.625, "completions/min_length": 403.0, "epoch": 11.260294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01572291925549507, "kl": 0.00798427511472255, "learning_rate": 4.772928657806018e-07, "loss": 8.017242362257093e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 497.125, "completions/min_length": 403.0, "epoch": 11.261764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5516661405563354, "kl": 0.0131606743671, "learning_rate": 4.771646659056288e-07, "loss": 0.00012978911399841309, "reward": 0.7533957958221436, "reward_std": 0.35672086477279663, "rewards/DrugCombAccuracyCOTORM/mean": 0.6976041793823242, "rewards/DrugCombAccuracyCOTORM/std": 0.42737138271331787, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 7658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 415.25, "completions/min_length": 384.0, "epoch": 11.263235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01097166072577238, "kl": 0.008375253062695265, "learning_rate": 4.770364675349898e-07, "loss": 8.29183409223333e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 483.375, "completions/min_length": 409.0, "epoch": 11.264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.02380339801311493, "kl": 0.011835331562906504, "learning_rate": 4.769082706771303e-07, "loss": 0.00011787181574618444, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 456.5, "completions/min_length": 365.0, "epoch": 11.266176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.022226564586162567, "kl": 0.012483075028285384, "learning_rate": 4.767800753404954e-07, "loss": 0.00012675972539000213, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 460.5, "completions/min_length": 413.0, "epoch": 11.26764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0316351652145386, "kl": 0.012501150835305452, "learning_rate": 4.766518815335304e-07, "loss": 0.0001245973980985582, "reward": 0.5979166626930237, "reward_std": 0.005892557092010975, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 452.625, "completions/min_length": 416.0, "epoch": 11.269117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8189749121665955, "kl": 0.012150870636105537, "learning_rate": 4.765236892646805e-07, "loss": 0.00012195737508591264, "reward": 0.6526666879653931, "reward_std": 0.14412209391593933, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 7663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 420.1875, "completions/min_length": 391.0, "epoch": 11.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010175362229347229, "kl": 0.0071229442255571485, "learning_rate": 4.7639549854239043e-07, "loss": 7.081679359544069e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 509.4375, "completions/min_length": 460.0, "epoch": 11.272058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9540910720825195, "kl": 0.010731431772001088, "learning_rate": 4.762673093751052e-07, "loss": 0.00010725855827331543, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 460.9375, "completions/min_length": 411.0, "epoch": 11.273529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9092597961425781, "kl": 0.01021835277788341, "learning_rate": 4.7613912177126946e-07, "loss": 0.00010158121585845947, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 7666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 413.625, "completions/min_length": 336.0, "epoch": 11.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.026090873405337334, "kl": 0.008892541285604239, "learning_rate": 4.760109357393281e-07, "loss": 8.832977619022131e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 424.75, "completions/min_length": 328.0, "epoch": 11.276470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.1140583753585815, "kl": 0.01045443641487509, "learning_rate": 4.758827512877255e-07, "loss": 0.00010402046609669924, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 417.0625, "completions/min_length": 338.0, "epoch": 11.277941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1861218214035034, "kl": 0.010822291253134608, "learning_rate": 4.757545684249064e-07, "loss": 0.00010894984006881714, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.75, "completions/min_length": 385.0, "epoch": 11.279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8373274207115173, "kl": 0.012421161169186234, "learning_rate": 4.756263871593149e-07, "loss": 0.00012509981752373278, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 475.0, "completions/min_length": 438.0, "epoch": 11.280882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.012486536055803299, "kl": 0.00810125912539661, "learning_rate": 4.7549820749939516e-07, "loss": 8.102523861452937e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 405.5625, "completions/min_length": 349.0, "epoch": 11.282352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0097535848617554, "kl": 0.008708734065294266, "learning_rate": 4.7537002945359154e-07, "loss": 8.7526990682818e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 453.8125, "completions/min_length": 376.0, "epoch": 11.283823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014143112115561962, "kl": 0.00748594431206584, "learning_rate": 4.7524185303034814e-07, "loss": 7.481200009351596e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 413.625, "completions/min_length": 343.0, "epoch": 11.285294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4495298862457275, "kl": 0.01043297303840518, "learning_rate": 4.7511367823810877e-07, "loss": 0.00010352581739425659, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 411.625, "completions/min_length": 345.0, "epoch": 11.286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8002133369445801, "kl": 0.010143046616576612, "learning_rate": 4.7498550508531733e-07, "loss": 0.00010165090498048812, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 455.625, "completions/min_length": 377.0, "epoch": 11.288235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.012465918436646461, "kl": 0.00781497999560088, "learning_rate": 4.748573335804175e-07, "loss": 7.867905515013263e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 435.75, "completions/min_length": 390.0, "epoch": 11.28970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.021369706839323044, "kl": 0.008534350548870862, "learning_rate": 4.747291637318529e-07, "loss": 8.516223897458985e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 424.75, "completions/min_length": 334.0, "epoch": 11.291176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008677776902914047, "kl": 0.007681982358917594, "learning_rate": 4.7460099554806715e-07, "loss": 7.68752652220428e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 463.4375, "completions/min_length": 414.0, "epoch": 11.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008457300253212452, "kl": 0.007214336888864636, "learning_rate": 4.744728290375034e-07, "loss": 7.220737461466342e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 442.25, "completions/min_length": 377.0, "epoch": 11.294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.019350402057170868, "kl": 0.008062982582487166, "learning_rate": 4.743446642086051e-07, "loss": 8.108778274618089e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 442.0625, "completions/min_length": 374.0, "epoch": 11.295588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.00947120226919651, "kl": 0.006568174809217453, "learning_rate": 4.742165010698154e-07, "loss": 6.559006578754634e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 430.625, "completions/min_length": 376.0, "epoch": 11.297058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014355845749378204, "kl": 0.007225847686640918, "learning_rate": 4.7408833962957734e-07, "loss": 7.322469173232093e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 414.9375, "completions/min_length": 387.0, "epoch": 11.298529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011879252269864082, "kl": 0.010420678881928325, "learning_rate": 4.739601798963339e-07, "loss": 0.0001044487944454886, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 436.4375, "completions/min_length": 360.0, "epoch": 11.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.011274519376456738, "kl": 0.007126508047804236, "learning_rate": 4.7383202187852804e-07, "loss": 7.092018495313823e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 451.4375, "completions/min_length": 413.0, "epoch": 11.301470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.017724886536598206, "kl": 0.01081017404794693, "learning_rate": 4.7370386558460224e-07, "loss": 0.00010712840594351292, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.75, "completions/min_length": 375.0, "epoch": 11.302941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.478998064994812, "kl": 0.013288932852447033, "learning_rate": 4.735757110229992e-07, "loss": 0.00013474375009536743, "reward": 0.8374999761581421, "reward_std": 0.3619407117366791, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 427.3125, "completions/min_length": 377.0, "epoch": 11.304411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.010870326310396194, "kl": 0.008095312165096402, "learning_rate": 4.734475582021615e-07, "loss": 8.145505853462964e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 530.1875, "completions/min_length": 454.0, "epoch": 11.305882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1694142818450928, "kl": 0.009813740965910256, "learning_rate": 4.733194071305314e-07, "loss": 9.790243348106742e-05, "reward": 0.6068750023841858, "reward_std": 0.04709697514772415, "rewards/DrugCombAccuracyCOTORM/mean": 0.551562488079071, "rewards/DrugCombAccuracyCOTORM/std": 0.46680375933647156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.3966001570224762, "step": 7688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.6875, "completions/min_length": 388.0, "epoch": 11.30735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.025075891986489296, "kl": 0.009410339524038136, "learning_rate": 4.731912578165513e-07, "loss": 9.375237277708948e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 448.0625, "completions/min_length": 402.0, "epoch": 11.308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.044800110161304474, "kl": 0.007476152619346976, "learning_rate": 4.7306311026866343e-07, "loss": 7.466335955541581e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 450.875, "completions/min_length": 408.0, "epoch": 11.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012614324688911438, "kl": 0.008191785425879061, "learning_rate": 4.7293496449530945e-07, "loss": 8.182880992535502e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 442.25, "completions/min_length": 409.0, "epoch": 11.311764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.569006085395813, "kl": 0.011879387777298689, "learning_rate": 4.728068205049316e-07, "loss": 0.00011888891458511353, "reward": 0.5171874761581421, "reward_std": 0.18816223740577698, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 7692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 453.5625, "completions/min_length": 354.0, "epoch": 11.313235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.009555668570101261, "kl": 0.008631429984234273, "learning_rate": 4.726786783059716e-07, "loss": 8.620516018709168e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 436.6875, "completions/min_length": 407.0, "epoch": 11.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.06284338980913162, "kl": 0.008511808817274868, "learning_rate": 4.725505379068711e-07, "loss": 8.514714136254042e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 491.1875, "completions/min_length": 445.0, "epoch": 11.316176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1528810262680054, "kl": 0.008816473884508014, "learning_rate": 4.7242239931607173e-07, "loss": 8.825957775115967e-05, "reward": 0.46250003576278687, "reward_std": 0.42211851477622986, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 420.75, "completions/min_length": 390.0, "epoch": 11.31764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009953412227332592, "kl": 0.0073653231374919415, "learning_rate": 4.72294262542015e-07, "loss": 7.374417327810079e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 452.375, "completions/min_length": 380.0, "epoch": 11.319117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.2680354118347168, "kl": 0.010282668401487172, "learning_rate": 4.72166127593142e-07, "loss": 0.00010207295417785645, "reward": 0.8562500476837158, "reward_std": 0.05988579988479614, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 487.75, "completions/min_length": 440.0, "epoch": 11.320588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011539610102772713, "kl": 0.008264360600151122, "learning_rate": 4.720379944778941e-07, "loss": 8.295165025629103e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 542.8125, "completions/min_length": 450.0, "epoch": 11.322058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0631866455078125, "kl": 0.009724401170387864, "learning_rate": 4.719098632047124e-07, "loss": 9.791553020477295e-05, "reward": 0.9468749761581421, "reward_std": 0.0733194574713707, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 7699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 453.75, "completions/min_length": 347.0, "epoch": 11.323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.027127359062433243, "kl": 0.009547963622026145, "learning_rate": 4.7178173378203777e-07, "loss": 9.668603161117062e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 479.5625, "completions/min_length": 418.0, "epoch": 11.325, "frac_reward_zero_std": 0.5, "grad_norm": 0.92844158411026, "kl": 0.011732972459867597, "learning_rate": 4.716536062183111e-07, "loss": 0.00011766688839998096, "reward": 0.9666666984558105, "reward_std": 0.061721328645944595, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 425.875, "completions/min_length": 372.0, "epoch": 11.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.062142208218574524, "kl": 0.008746277540922165, "learning_rate": 4.7152548052197326e-07, "loss": 8.788093691691756e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 413.8125, "completions/min_length": 336.0, "epoch": 11.327941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014202912338078022, "kl": 0.008279932313598692, "learning_rate": 4.713973567014646e-07, "loss": 8.273401181213558e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 529.9375, "completions/min_length": 459.0, "epoch": 11.329411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.5923510789871216, "kl": 0.011522804386913776, "learning_rate": 4.712692347652257e-07, "loss": 0.0001156143844127655, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 7704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 462.0625, "completions/min_length": 418.0, "epoch": 11.330882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01293142605572939, "kl": 0.007846077322028577, "learning_rate": 4.711411147216969e-07, "loss": 7.748229836579412e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 422.25, "completions/min_length": 382.0, "epoch": 11.33235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011571639217436314, "kl": 0.009322223719209433, "learning_rate": 4.710129965793184e-07, "loss": 9.331450564786792e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 462.1875, "completions/min_length": 418.0, "epoch": 11.333823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0966805219650269, "kl": 0.009575691539794207, "learning_rate": 4.7088488034653035e-07, "loss": 9.60230827331543e-05, "reward": 0.6499999761581421, "reward_std": 0.220389261841774, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 489.6875, "completions/min_length": 382.0, "epoch": 11.33529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0851815938949585, "kl": 0.009409411926753819, "learning_rate": 4.7075676603177277e-07, "loss": 9.445594332646579e-05, "reward": 0.7945833206176758, "reward_std": 0.17138423025608063, "rewards/DrugCombAccuracyCOTORM/mean": 0.7562500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.3742956519126892, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17786456644535065, "step": 7708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 444.3125, "completions/min_length": 400.0, "epoch": 11.336764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.3285281658172607, "kl": 0.017039771657437086, "learning_rate": 4.706286536434854e-07, "loss": 0.00016843527555465698, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 425.5625, "completions/min_length": 385.0, "epoch": 11.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01277129352092743, "kl": 0.009998946217820048, "learning_rate": 4.7050054319010796e-07, "loss": 0.00010031360579887405, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 445.125, "completions/min_length": 402.0, "epoch": 11.339705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7979010343551636, "kl": 0.008132770308293402, "learning_rate": 4.703724346800801e-07, "loss": 8.174113463610411e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 469.875, "completions/min_length": 427.0, "epoch": 11.341176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.017439603805542, "kl": 0.008885565446689725, "learning_rate": 4.7024432812184123e-07, "loss": 8.854288171278313e-05, "reward": 0.59375, "reward_std": 0.00862581841647625, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 7712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 504.875, "completions/min_length": 404.0, "epoch": 11.342647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7269403338432312, "kl": 0.009363661985844374, "learning_rate": 4.7011622352383073e-07, "loss": 9.32519615162164e-05, "reward": 0.7615333199501038, "reward_std": 0.15205246210098267, "rewards/DrugCombAccuracyCOTORM/mean": 0.7102500200271606, "rewards/DrugCombAccuracyCOTORM/std": 0.39165589213371277, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9333333373069763, "rewards/DrugCombCoverageCOTORM/std": 0.08944272994995117, "step": 7713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 463.1875, "completions/min_length": 386.0, "epoch": 11.344117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9926319718360901, "kl": 0.00908718432765454, "learning_rate": 4.699881208944879e-07, "loss": 9.219116327585652e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 534.0625, "completions/min_length": 391.0, "epoch": 11.345588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9133484959602356, "kl": 0.008489838102832437, "learning_rate": 4.6986002024225165e-07, "loss": 8.474155038129538e-05, "reward": 0.8374999761581421, "reward_std": 0.13503378629684448, "rewards/DrugCombAccuracyCOTORM/mean": 0.8020833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.30561867356300354, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 7715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 476.5625, "completions/min_length": 394.0, "epoch": 11.347058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.971104085445404, "kl": 0.008546594297513366, "learning_rate": 4.6973192157556094e-07, "loss": 8.522865391569212e-05, "reward": 0.9645833373069763, "reward_std": 0.065730020403862, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 478.0, "completions/min_length": 411.0, "epoch": 11.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0960981845855713, "kl": 0.012933527817949653, "learning_rate": 4.6960382490285467e-07, "loss": 0.00012990087270736694, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 7717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 471.5625, "completions/min_length": 432.0, "epoch": 11.35, "frac_reward_zero_std": 1.0, "grad_norm": 0.026607122272253036, "kl": 0.009411183884367347, "learning_rate": 4.694757302325715e-07, "loss": 9.478422725806013e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 439.1875, "completions/min_length": 355.0, "epoch": 11.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03291289135813713, "kl": 0.010236791567876935, "learning_rate": 4.6934763757314993e-07, "loss": 0.00010139689402421936, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 479.6875, "completions/min_length": 436.0, "epoch": 11.352941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.5497097969055176, "kl": 0.011246660025790334, "learning_rate": 4.6921954693302856e-07, "loss": 0.00011227652430534363, "reward": 0.6500000357627869, "reward_std": 0.4208287000656128, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 554.0, "completions/min_length": 439.0, "epoch": 11.354411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.0763987302780151, "kl": 0.013392237713560462, "learning_rate": 4.6909145832064546e-07, "loss": 0.00013740130816586316, "reward": 0.8972083330154419, "reward_std": 0.04572759196162224, "rewards/DrugCombAccuracyCOTORM/mean": 0.8806250095367432, "rewards/DrugCombAccuracyCOTORM/std": 0.14356425404548645, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.08539126813411713, "step": 7721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 522.5625, "completions/min_length": 446.0, "epoch": 11.355882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.351954460144043, "kl": 0.010129753150977194, "learning_rate": 4.689633717444388e-07, "loss": 0.0001010894775390625, "reward": 0.6988750100135803, "reward_std": 0.33979296684265137, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.48780280351638794, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 7722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 440.8125, "completions/min_length": 381.0, "epoch": 11.35735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03435239940881729, "kl": 0.011574001051485538, "learning_rate": 4.688352872128467e-07, "loss": 0.00011450429155956954, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 426.75, "completions/min_length": 358.0, "epoch": 11.358823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.05240156129002571, "kl": 0.012080458924174309, "learning_rate": 4.6870720473430704e-07, "loss": 0.000121359305921942, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 480.5, "completions/min_length": 418.0, "epoch": 11.360294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1788545846939087, "kl": 0.013760719681158662, "learning_rate": 4.6857912431725754e-07, "loss": 0.00013921062054578215, "reward": 0.7342303991317749, "reward_std": 0.22258123755455017, "rewards/DrugCombAccuracyCOTORM/mean": 0.699874997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.4620248079299927, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7433035373687744, "rewards/DrugCombCoverageCOTORM/std": 0.6090612411499023, "step": 7725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 457.9375, "completions/min_length": 407.0, "epoch": 11.361764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013745714910328388, "kl": 0.007571909227408469, "learning_rate": 4.684510459701359e-07, "loss": 7.581076351925731e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 7726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 505.75, "completions/min_length": 449.0, "epoch": 11.363235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4675180912017822, "kl": 0.010957022896036506, "learning_rate": 4.683229697013794e-07, "loss": 0.00010909140110015869, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 477.8125, "completions/min_length": 409.0, "epoch": 11.364705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3970119953155518, "kl": 0.012551259016618133, "learning_rate": 4.6819489551942547e-07, "loss": 0.00012432783842086792, "reward": 0.8884166479110718, "reward_std": 0.27997833490371704, "rewards/DrugCombAccuracyCOTORM/mean": 0.8683333396911621, "rewards/DrugCombAccuracyCOTORM/std": 0.31780266761779785, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 440.9375, "completions/min_length": 390.0, "epoch": 11.366176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 1.635861873626709, "kl": 0.013139789691194892, "learning_rate": 4.680668234327114e-07, "loss": 0.00013091415166854858, "reward": 0.5625, "reward_std": 0.46579426527023315, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 7729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 493.3125, "completions/min_length": 441.0, "epoch": 11.367647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2947916984558105, "kl": 0.009340017335489392, "learning_rate": 4.6793875344967414e-07, "loss": 9.364930883748457e-05, "reward": 0.7104166746139526, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.36955931782722473, "step": 7730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 434.1875, "completions/min_length": 345.0, "epoch": 11.369117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01574760489165783, "kl": 0.009882151149213314, "learning_rate": 4.678106855787506e-07, "loss": 9.769823373062536e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 482.3125, "completions/min_length": 416.0, "epoch": 11.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012785231694579124, "kl": 0.0094742028741166, "learning_rate": 4.676826198283779e-07, "loss": 9.376261732541025e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 468.6875, "completions/min_length": 370.0, "epoch": 11.37205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8700634241104126, "kl": 0.008992381393909454, "learning_rate": 4.675545562069922e-07, "loss": 9.000566933536902e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 7733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 475.875, "completions/min_length": 396.0, "epoch": 11.373529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.02966608665883541, "kl": 0.0100067452294752, "learning_rate": 4.6742649472303027e-07, "loss": 0.00010219159594271332, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 478.4375, "completions/min_length": 394.0, "epoch": 11.375, "frac_reward_zero_std": 0.5, "grad_norm": 1.0698381662368774, "kl": 0.01085750199854374, "learning_rate": 4.672984353849284e-07, "loss": 0.00010842084884643555, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 434.875, "completions/min_length": 400.0, "epoch": 11.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9408471584320068, "kl": 0.013139422400854528, "learning_rate": 4.6717037820112285e-07, "loss": 0.0001313909888267517, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 458.9375, "completions/min_length": 400.0, "epoch": 11.37794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0403449535369873, "kl": 0.013684209319762886, "learning_rate": 4.6704232318004976e-07, "loss": 0.00013761594891548157, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 410.625, "completions/min_length": 350.0, "epoch": 11.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012554945424199104, "kl": 0.006606613285839558, "learning_rate": 4.669142703301451e-07, "loss": 6.596418097615242e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 430.1875, "completions/min_length": 366.0, "epoch": 11.380882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01314516644924879, "kl": 0.00889901292975992, "learning_rate": 4.6678621965984444e-07, "loss": 8.849897858453915e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 474.0625, "completions/min_length": 432.0, "epoch": 11.382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.772127091884613, "kl": 0.008441898506134748, "learning_rate": 4.6665817117758353e-07, "loss": 8.36970575619489e-05, "reward": 0.7990833520889282, "reward_std": 0.07884903252124786, "rewards/DrugCombAccuracyCOTORM/mean": 0.7775000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26973649859428406, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.24247947335243225, "step": 7740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 418.6875, "completions/min_length": 342.0, "epoch": 11.383823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.012062141671776772, "kl": 0.009875706164166331, "learning_rate": 4.6653012489179807e-07, "loss": 9.894937102217227e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 446.0625, "completions/min_length": 387.0, "epoch": 11.385294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010976715013384819, "kl": 0.008369301445782185, "learning_rate": 4.6640208081092316e-07, "loss": 8.39753047330305e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 484.5, "completions/min_length": 428.0, "epoch": 11.386764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9331767559051514, "kl": 0.01088717021048069, "learning_rate": 4.6627403894339416e-07, "loss": 0.00010740011930465698, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 416.4375, "completions/min_length": 367.0, "epoch": 11.388235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01824776455760002, "kl": 0.009342132951132953, "learning_rate": 4.6614599929764623e-07, "loss": 9.391827188665047e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 466.0, "completions/min_length": 321.0, "epoch": 11.389705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.883328914642334, "kl": 0.009129007463343441, "learning_rate": 4.660179618821142e-07, "loss": 9.036388655658811e-05, "reward": 0.9000000357627869, "reward_std": 0.12121832370758057, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.24397502839565277, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 469.9375, "completions/min_length": 426.0, "epoch": 11.391176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.027554988861084, "kl": 0.00931002153083682, "learning_rate": 4.658899267052327e-07, "loss": 9.277090430259705e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 461.0, "completions/min_length": 357.0, "epoch": 11.39264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0077853202819824, "kl": 0.013443608302623034, "learning_rate": 4.657618937754366e-07, "loss": 0.00013311952352523804, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 489.3125, "completions/min_length": 434.0, "epoch": 11.394117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.2200782299041748, "kl": 0.010680807754397392, "learning_rate": 4.656338631011602e-07, "loss": 0.00010704994201660156, "reward": 0.59375, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 442.625, "completions/min_length": 409.0, "epoch": 11.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8723426461219788, "kl": 0.008649527444504201, "learning_rate": 4.65505834690838e-07, "loss": 8.592547237640247e-05, "reward": 0.75, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 517.125, "completions/min_length": 410.0, "epoch": 11.397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.042753588408231735, "kl": 0.008060187683440745, "learning_rate": 4.653778085529042e-07, "loss": 8.003506809473038e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 542.625, "completions/min_length": 429.0, "epoch": 11.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2594581842422485, "kl": 0.011278528720140457, "learning_rate": 4.6524978469579264e-07, "loss": 0.00011770211131079122, "reward": 0.6530357003211975, "reward_std": 0.10222593694925308, "rewards/DrugCombAccuracyCOTORM/mean": 0.6001487970352173, "rewards/DrugCombAccuracyCOTORM/std": 0.44614794850349426, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.30352863669395447, "step": 7751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 478.8125, "completions/min_length": 389.0, "epoch": 11.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.8152220249176025, "kl": 0.00786569679621607, "learning_rate": 4.6512176312793735e-07, "loss": 7.862597703933716e-05, "reward": 0.83645099401474, "reward_std": 0.10160154849290848, "rewards/DrugCombAccuracyCOTORM/mean": 0.8143137097358704, "rewards/DrugCombAccuracyCOTORM/std": 0.25862404704093933, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 7752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 453.0625, "completions/min_length": 366.0, "epoch": 11.401470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5957685708999634, "kl": 0.011662575881928205, "learning_rate": 4.6499374385777196e-07, "loss": 0.00011764466762542725, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 493.5, "completions/min_length": 439.0, "epoch": 11.402941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3422820568084717, "kl": 0.008689154637977481, "learning_rate": 4.648657268937302e-07, "loss": 8.727610111236572e-05, "reward": 0.6937500238418579, "reward_std": 0.20284168422222137, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 479.75, "completions/min_length": 421.0, "epoch": 11.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.021848587319254875, "kl": 0.013517663581296802, "learning_rate": 4.647377122442455e-07, "loss": 0.0001359245798084885, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 507.0625, "completions/min_length": 385.0, "epoch": 11.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8622040152549744, "kl": 0.011776694562286139, "learning_rate": 4.646096999177511e-07, "loss": 0.00011458706285338849, "reward": 0.718583345413208, "reward_std": 0.20846574008464813, "rewards/DrugCombAccuracyCOTORM/mean": 0.67166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.4718286097049713, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 432.0625, "completions/min_length": 401.0, "epoch": 11.407352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011165411211550236, "kl": 0.008984684362076223, "learning_rate": 4.6448168992268e-07, "loss": 8.931833144742996e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 414.5625, "completions/min_length": 357.0, "epoch": 11.408823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014276179485023022, "kl": 0.009717706125229597, "learning_rate": 4.643536822674653e-07, "loss": 9.701203089207411e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 470.0, "completions/min_length": 411.0, "epoch": 11.410294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1004738807678223, "kl": 0.009663276723586023, "learning_rate": 4.642256769605398e-07, "loss": 9.778775711311027e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 459.1875, "completions/min_length": 392.0, "epoch": 11.411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008017762564122677, "kl": 0.006639202823862433, "learning_rate": 4.640976740103362e-07, "loss": 6.62883321638219e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 506.8125, "completions/min_length": 401.0, "epoch": 11.413235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.7354378700256348, "kl": 0.0073595845606178045, "learning_rate": 4.639696734252869e-07, "loss": 7.34180212020874e-05, "reward": 0.8250476121902466, "reward_std": 0.0706915631890297, "rewards/DrugCombAccuracyCOTORM/mean": 0.7835416793823242, "rewards/DrugCombAccuracyCOTORM/std": 0.2536415457725525, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9821428656578064, "rewards/DrugCombCoverageCOTORM/std": 0.0714285671710968, "step": 7761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 486.5, "completions/min_length": 407.0, "epoch": 11.41470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011657613329589367, "kl": 0.007987112971022725, "learning_rate": 4.638416752138245e-07, "loss": 8.084499859251082e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 426.25, "completions/min_length": 394.0, "epoch": 11.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1295421123504639, "kl": 0.011511666001752019, "learning_rate": 4.637136793843809e-07, "loss": 0.00011529028415679932, "reward": 0.7873333692550659, "reward_std": 0.20299598574638367, "rewards/DrugCombAccuracyCOTORM/mean": 0.73416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.4422292113304138, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 440.1875, "completions/min_length": 373.0, "epoch": 11.41764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02757415734231472, "kl": 0.009184653405100107, "learning_rate": 4.6358568594538835e-07, "loss": 9.295744530390948e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 459.0, "completions/min_length": 356.0, "epoch": 11.419117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.892864465713501, "kl": 0.010231415624730289, "learning_rate": 4.634576949052786e-07, "loss": 0.00010273297084495425, "reward": 0.6538333296775818, "reward_std": 0.14982688426971436, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4699627757072449, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 7765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 469.4375, "completions/min_length": 404.0, "epoch": 11.420588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011127634905278683, "kl": 0.007886538514867425, "learning_rate": 4.633297062724834e-07, "loss": 7.810856914147735e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 488.9375, "completions/min_length": 412.0, "epoch": 11.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6786829829216003, "kl": 0.00731785676907748, "learning_rate": 4.632017200554345e-07, "loss": 7.341389573412016e-05, "reward": 0.9156249761581421, "reward_std": 0.1164485514163971, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.20155644416809082, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 7767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 444.5625, "completions/min_length": 388.0, "epoch": 11.423529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008545373566448689, "kl": 0.007653985521756113, "learning_rate": 4.6307373626256304e-07, "loss": 7.672292122151703e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 463.125, "completions/min_length": 405.0, "epoch": 11.425, "frac_reward_zero_std": 0.5, "grad_norm": 1.386467456817627, "kl": 0.01103387784678489, "learning_rate": 4.6294575490230037e-07, "loss": 0.00010998204379575327, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 453.75, "completions/min_length": 406.0, "epoch": 11.426470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.3823261260986328, "kl": 0.012009725207462907, "learning_rate": 4.628177759830776e-07, "loss": 0.00012198090553283691, "reward": 0.7749999761581421, "reward_std": 0.4200340211391449, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 438.625, "completions/min_length": 399.0, "epoch": 11.427941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8591569662094116, "kl": 0.008540412760339677, "learning_rate": 4.6268979951332566e-07, "loss": 8.544413140043616e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 428.8125, "completions/min_length": 376.0, "epoch": 11.429411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4228103160858154, "kl": 0.017580684507265687, "learning_rate": 4.625618255014753e-07, "loss": 0.00017390400171279907, "reward": 0.5874999761581421, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 508.875, "completions/min_length": 409.0, "epoch": 11.430882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.7507161498069763, "kl": 0.008877420448698103, "learning_rate": 4.6243385395595725e-07, "loss": 8.894503116607666e-05, "reward": 0.9391250014305115, "reward_std": 0.12156426906585693, "rewards/DrugCombAccuracyCOTORM/mean": 0.9304167032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.2068883180618286, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.145535409450531, "step": 7773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 475.125, "completions/min_length": 443.0, "epoch": 11.43235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0305283069610596, "kl": 0.012128986534662545, "learning_rate": 4.6230588488520175e-07, "loss": 0.00012171853450126946, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 438.75, "completions/min_length": 352.0, "epoch": 11.433823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.117069125175476, "kl": 0.010394112090580165, "learning_rate": 4.621779182976391e-07, "loss": 0.00010497562470845878, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 460.625, "completions/min_length": 405.0, "epoch": 11.435294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8535156846046448, "kl": 0.009166684001684189, "learning_rate": 4.620499542016995e-07, "loss": 9.151291305897757e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 438.1875, "completions/min_length": 371.0, "epoch": 11.436764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.014149477705359459, "kl": 0.00822592806071043, "learning_rate": 4.619219926058129e-07, "loss": 8.301538764499128e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 454.5, "completions/min_length": 381.0, "epoch": 11.438235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.023924635723233223, "kl": 0.008837194414809346, "learning_rate": 4.6179403351840907e-07, "loss": 8.891039760783315e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 464.25, "completions/min_length": 382.0, "epoch": 11.439705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9165449738502502, "kl": 0.014203603728674352, "learning_rate": 4.616660769479177e-07, "loss": 0.00014189258217811584, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 447.125, "completions/min_length": 375.0, "epoch": 11.441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.025112709030508995, "kl": 0.008650746196508408, "learning_rate": 4.615381229027681e-07, "loss": 8.737549796933308e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 422.3125, "completions/min_length": 383.0, "epoch": 11.44264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009263223968446255, "kl": 0.006400709040462971, "learning_rate": 4.614101713913896e-07, "loss": 6.434653187170625e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 495.3125, "completions/min_length": 445.0, "epoch": 11.444117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.061773657798767, "kl": 0.009993452578783035, "learning_rate": 4.612822224222114e-07, "loss": 9.988993406295776e-05, "reward": 0.9416458606719971, "reward_std": 0.08156458288431168, "rewards/DrugCombAccuracyCOTORM/mean": 0.9296614527702332, "rewards/DrugCombAccuracyCOTORM/std": 0.15351733565330505, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 7782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 567.375, "completions/min_length": 485.0, "epoch": 11.445588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.297461748123169, "kl": 0.0099164426792413, "learning_rate": 4.611542760036623e-07, "loss": 9.856373071670532e-05, "reward": 0.6130427718162537, "reward_std": 0.08617256581783295, "rewards/DrugCombAccuracyCOTORM/mean": 0.5449492931365967, "rewards/DrugCombAccuracyCOTORM/std": 0.43866559863090515, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.25059187412261963, "step": 7783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 437.1875, "completions/min_length": 364.0, "epoch": 11.447058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9491410851478577, "kl": 0.010817334754392505, "learning_rate": 4.6102633214417123e-07, "loss": 0.00010792378452606499, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 490.75, "completions/min_length": 417.0, "epoch": 11.448529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.9572261571884155, "kl": 0.011841397965326905, "learning_rate": 4.608983908521669e-07, "loss": 0.00011958181858062744, "reward": 0.4593958556652069, "reward_std": 0.22516405582427979, "rewards/DrugCombAccuracyCOTORM/mean": 0.34507814049720764, "rewards/DrugCombAccuracyCOTORM/std": 0.3541584014892578, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.18257418274879456, "step": 7785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 418.625, "completions/min_length": 384.0, "epoch": 11.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.019586551934480667, "kl": 0.009458732441999018, "learning_rate": 4.6077045213607755e-07, "loss": 9.572983253747225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 477.25, "completions/min_length": 424.0, "epoch": 11.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0583305358886719, "kl": 0.011149780359119177, "learning_rate": 4.6064251600433146e-07, "loss": 0.00011140144488308579, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 474.25, "completions/min_length": 422.0, "epoch": 11.452941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0852241516113281, "kl": 0.009414180414751172, "learning_rate": 4.605145824653569e-07, "loss": 9.323688573203981e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 455.6875, "completions/min_length": 410.0, "epoch": 11.454411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.5272797346115112, "kl": 0.010692982119508088, "learning_rate": 4.6038665152758165e-07, "loss": 0.0001074075698852539, "reward": 0.84375, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 494.875, "completions/min_length": 417.0, "epoch": 11.455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8816364407539368, "kl": 0.008121205843053758, "learning_rate": 4.602587231994336e-07, "loss": 8.109211921691895e-05, "reward": 0.7092499732971191, "reward_std": 0.1821122020483017, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 7790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 451.0625, "completions/min_length": 389.0, "epoch": 11.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9375247359275818, "kl": 0.008840735303238034, "learning_rate": 4.6013079748934037e-07, "loss": 8.815526962280273e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 7791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 487.0, "completions/min_length": 367.0, "epoch": 11.458823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9127241969108582, "kl": 0.00816604180727154, "learning_rate": 4.600028744057292e-07, "loss": 8.115172386169434e-05, "reward": 0.918749988079071, "reward_std": 0.08972509950399399, "rewards/DrugCombAccuracyCOTORM/mean": 0.90625, "rewards/DrugCombAccuracyCOTORM/std": 0.1717960685491562, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 448.375, "completions/min_length": 409.0, "epoch": 11.46029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0143988486379385, "kl": 0.008156097726896405, "learning_rate": 4.5987495395702745e-07, "loss": 8.128411718644202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 536.75, "completions/min_length": 455.0, "epoch": 11.461764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.2499860525131226, "kl": 0.008941641659475863, "learning_rate": 4.5974703615166217e-07, "loss": 8.963793516159058e-05, "reward": 0.30000001192092896, "reward_std": 0.2787647545337677, "rewards/DrugCombAccuracyCOTORM/mean": 0.21875, "rewards/DrugCombAccuracyCOTORM/std": 0.3145764470100403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 7794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 396.0625, "completions/min_length": 338.0, "epoch": 11.463235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1565934419631958, "kl": 0.034373369766399264, "learning_rate": 4.596191209980603e-07, "loss": 0.00033821165561676025, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 444.9375, "completions/min_length": 388.0, "epoch": 11.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1784982681274414, "kl": 0.010015500127337873, "learning_rate": 4.5949120850464855e-07, "loss": 9.822845458984375e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 523.4375, "completions/min_length": 459.0, "epoch": 11.466176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.117768406867981, "kl": 0.010745860869064927, "learning_rate": 4.593632986798536e-07, "loss": 0.00010813400149345398, "reward": 0.5954999923706055, "reward_std": 0.2662253975868225, "rewards/DrugCombAccuracyCOTORM/mean": 0.5204166769981384, "rewards/DrugCombAccuracyCOTORM/std": 0.46592622995376587, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.197202667593956, "step": 7797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 447.1875, "completions/min_length": 405.0, "epoch": 11.467647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2915953397750854, "kl": 0.013483922462910414, "learning_rate": 4.592353915321016e-07, "loss": 0.00013473308354150504, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 418.8125, "completions/min_length": 283.0, "epoch": 11.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.010261867195367813, "kl": 0.005902185454033315, "learning_rate": 4.5910748706981873e-07, "loss": 5.9955673350486904e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 416.625, "completions/min_length": 335.0, "epoch": 11.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8483563661575317, "kl": 0.008240849361754954, "learning_rate": 4.589795853014312e-07, "loss": 8.258213347289711e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 462.0625, "completions/min_length": 344.0, "epoch": 11.472058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.304740071296692, "kl": 0.013447057688608766, "learning_rate": 4.588516862353648e-07, "loss": 0.00013355165719985962, "reward": 0.6447916626930237, "reward_std": 0.3299272656440735, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.11735905706882477, "step": 7801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 413.25, "completions/min_length": 377.0, "epoch": 11.473529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.08851980417966843, "kl": 0.011042711790651083, "learning_rate": 4.587237898800451e-07, "loss": 0.00011107644240837544, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 443.1875, "completions/min_length": 394.0, "epoch": 11.475, "frac_reward_zero_std": 1.0, "grad_norm": 0.014276501722633839, "kl": 0.008641707361675799, "learning_rate": 4.585958962438979e-07, "loss": 8.51120930747129e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 411.375, "completions/min_length": 355.0, "epoch": 11.476470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016013994812965393, "kl": 0.007117023691534996, "learning_rate": 4.584680053353481e-07, "loss": 6.939157901797444e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 422.5625, "completions/min_length": 375.0, "epoch": 11.477941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1433320045471191, "kl": 0.013493834994733334, "learning_rate": 4.58340117162821e-07, "loss": 0.0001352809340460226, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 450.5625, "completions/min_length": 385.0, "epoch": 11.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012131442315876484, "kl": 0.008602240588515997, "learning_rate": 4.582122317347415e-07, "loss": 8.671084651723504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 523.5625, "completions/min_length": 436.0, "epoch": 11.480882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.4647573232650757, "kl": 0.010495822527445853, "learning_rate": 4.580843490595345e-07, "loss": 0.00010425224900245667, "reward": 0.40416666865348816, "reward_std": 0.42496436834335327, "rewards/DrugCombAccuracyCOTORM/mean": 0.3333333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.45542004704475403, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 452.25, "completions/min_length": 406.0, "epoch": 11.48235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.561823844909668, "kl": 0.01218707486987114, "learning_rate": 4.5795646914562445e-07, "loss": 0.0001204460859298706, "reward": 0.6625000238418579, "reward_std": 0.4488446116447449, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 450.3125, "completions/min_length": 417.0, "epoch": 11.483823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8842896819114685, "kl": 0.010547130834311247, "learning_rate": 4.57828592001436e-07, "loss": 0.00010532644228078425, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 446.625, "completions/min_length": 401.0, "epoch": 11.485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0343530997633934, "kl": 0.012438557809218764, "learning_rate": 4.57700717635393e-07, "loss": 0.00012467405758798122, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 410.9375, "completions/min_length": 333.0, "epoch": 11.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020837271586060524, "kl": 0.010902636568062007, "learning_rate": 4.5757284605591973e-07, "loss": 0.00010894534352701157, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 448.875, "completions/min_length": 378.0, "epoch": 11.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04307086020708084, "kl": 0.010681078652851284, "learning_rate": 4.574449772714399e-07, "loss": 0.00010649630712578073, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 429.0625, "completions/min_length": 394.0, "epoch": 11.489705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.10653189569711685, "kl": 0.010165079147554934, "learning_rate": 4.573171112903774e-07, "loss": 0.00010295987885911018, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 501.0625, "completions/min_length": 458.0, "epoch": 11.491176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.8884725570678711, "kl": 0.008144291467033327, "learning_rate": 4.571892481211556e-07, "loss": 8.133798837661743e-05, "reward": 0.656166672706604, "reward_std": 0.04289780929684639, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 7814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 535.375, "completions/min_length": 444.0, "epoch": 11.492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9553771018981934, "kl": 0.008745414903387427, "learning_rate": 4.570613877721978e-07, "loss": 8.762294601183385e-05, "reward": 0.7874270677566528, "reward_std": 0.15168850123882294, "rewards/DrugCombAccuracyCOTORM/mean": 0.7391666769981384, "rewards/DrugCombAccuracyCOTORM/std": 0.37071701884269714, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9609375, "rewards/DrugCombCoverageCOTORM/std": 0.059839196503162384, "step": 7815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 409.5, "completions/min_length": 360.0, "epoch": 11.494117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.028883133083581924, "kl": 0.007164181093685329, "learning_rate": 4.56933530251927e-07, "loss": 7.158854714361951e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 459.3125, "completions/min_length": 404.0, "epoch": 11.495588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7914785146713257, "kl": 0.010406379587948322, "learning_rate": 4.568056755687663e-07, "loss": 0.00010362340253777802, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 416.8125, "completions/min_length": 375.0, "epoch": 11.49705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.4955339431762695, "kl": 0.012969970935955644, "learning_rate": 4.566778237311384e-07, "loss": 0.00012945383787155151, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 419.125, "completions/min_length": 357.0, "epoch": 11.498529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.028523452579975128, "kl": 0.009808563976548612, "learning_rate": 4.5654997474746577e-07, "loss": 9.787065209820867e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 401.1875, "completions/min_length": 336.0, "epoch": 11.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.008753049187362194, "kl": 0.006260383874177933, "learning_rate": 4.5642212862617085e-07, "loss": 6.274622865021229e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 411.5, "completions/min_length": 347.0, "epoch": 11.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.014117956161499, "kl": 0.010071578319184482, "learning_rate": 4.5629428537567594e-07, "loss": 9.999843314290047e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 439.8125, "completions/min_length": 389.0, "epoch": 11.50294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01310442853718996, "kl": 0.006719377124682069, "learning_rate": 4.5616644500440283e-07, "loss": 6.643219967372715e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 471.0625, "completions/min_length": 391.0, "epoch": 11.504411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.423178791999817, "kl": 0.011401788098737597, "learning_rate": 4.560386075207734e-07, "loss": 0.00011620670557022095, "reward": 0.7875000238418579, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 515.5, "completions/min_length": 405.0, "epoch": 11.505882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.1688958406448364, "kl": 0.008822304545901716, "learning_rate": 4.559107729332092e-07, "loss": 8.783489465713501e-05, "reward": 0.7250000238418579, "reward_std": 0.38195645809173584, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 459.25, "completions/min_length": 402.0, "epoch": 11.507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01816112920641899, "kl": 0.008007020922377706, "learning_rate": 4.5578294125013174e-07, "loss": 7.984914554981515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 432.3125, "completions/min_length": 386.0, "epoch": 11.508823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01017041690647602, "kl": 0.010258086258545518, "learning_rate": 4.556551124799621e-07, "loss": 0.0001025190285872668, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 449.5625, "completions/min_length": 377.0, "epoch": 11.510294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.022469306364655495, "kl": 0.009056632174178958, "learning_rate": 4.555272866311217e-07, "loss": 9.141024202108383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 418.8125, "completions/min_length": 388.0, "epoch": 11.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016142746433615685, "kl": 0.00880828546360135, "learning_rate": 4.553994637120309e-07, "loss": 8.769625128479674e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 427.4375, "completions/min_length": 365.0, "epoch": 11.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.864090085029602, "kl": 0.008116555167362094, "learning_rate": 4.552716437311106e-07, "loss": 8.158334094332531e-05, "reward": 0.7250000238418579, "reward_std": 0.23145504295825958, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 493.125, "completions/min_length": 395.0, "epoch": 11.514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0373733043670654, "kl": 0.011297130724415183, "learning_rate": 4.5514382669678126e-07, "loss": 0.00011375546455383301, "reward": 0.8479166626930237, "reward_std": 0.20995795726776123, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 454.875, "completions/min_length": 382.0, "epoch": 11.516176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8020991086959839, "kl": 0.01384626841172576, "learning_rate": 4.5501601261746303e-07, "loss": 0.0001390964607708156, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 413.8125, "completions/min_length": 346.0, "epoch": 11.51764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8194679021835327, "kl": 0.008045626571401954, "learning_rate": 4.54888201501576e-07, "loss": 8.023253758437932e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 481.3125, "completions/min_length": 429.0, "epoch": 11.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03987995535135269, "kl": 0.011222995817661285, "learning_rate": 4.5476039335754036e-07, "loss": 0.00011368218110874295, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 490.625, "completions/min_length": 418.0, "epoch": 11.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9942700266838074, "kl": 0.008955632103607059, "learning_rate": 4.546325881937753e-07, "loss": 8.709356188774109e-05, "reward": 0.875, "reward_std": 0.18322508037090302, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 451.1875, "completions/min_length": 377.0, "epoch": 11.522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9384830594062805, "kl": 0.009301342535763979, "learning_rate": 4.545047860187005e-07, "loss": 9.302562102675438e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 460.3125, "completions/min_length": 413.0, "epoch": 11.523529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9001336693763733, "kl": 0.009205949725583196, "learning_rate": 4.5437698684073536e-07, "loss": 9.232395677827299e-05, "reward": 0.6687500476837158, "reward_std": 0.2069118171930313, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 7836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 479.625, "completions/min_length": 446.0, "epoch": 11.525, "frac_reward_zero_std": 0.5, "grad_norm": 0.9662955403327942, "kl": 0.010318473679944873, "learning_rate": 4.542491906682988e-07, "loss": 0.00010337616549804807, "reward": 0.6687500476837158, "reward_std": 0.20517849922180176, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 7837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 458.4375, "completions/min_length": 413.0, "epoch": 11.526470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5155185461044312, "kl": 0.01668666396290064, "learning_rate": 4.541213975098098e-07, "loss": 0.00016319751739501953, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 397.25, "completions/min_length": 357.0, "epoch": 11.527941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009157106280326843, "kl": 0.0067018839763477445, "learning_rate": 4.5399360737368707e-07, "loss": 6.677908822894096e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 458.9375, "completions/min_length": 420.0, "epoch": 11.529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.3526222705841064, "kl": 0.02072112960740924, "learning_rate": 4.5386582026834904e-07, "loss": 0.00020656734704971313, "reward": 0.6802083253860474, "reward_std": 0.27715805172920227, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.37453675270080566, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 491.1875, "completions/min_length": 436.0, "epoch": 11.530882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.021458206698298454, "kl": 0.012201114790514112, "learning_rate": 4.53738036202214e-07, "loss": 0.00012224460078869015, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 413.6875, "completions/min_length": 382.0, "epoch": 11.532352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1398478746414185, "kl": 0.00815144449006766, "learning_rate": 4.536102551837e-07, "loss": 8.163763413904235e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 487.3125, "completions/min_length": 428.0, "epoch": 11.533823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.801609456539154, "kl": 0.008985430002212524, "learning_rate": 4.534824772212251e-07, "loss": 8.933351637097076e-05, "reward": 0.8233333826065063, "reward_std": 0.14813123643398285, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3265986442565918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 509.25, "completions/min_length": 429.0, "epoch": 11.535294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9960724711418152, "kl": 0.011216684710234404, "learning_rate": 4.533547023232068e-07, "loss": 0.00011079866817453876, "reward": 0.8802083730697632, "reward_std": 0.10019201785326004, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.22669117152690887, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 434.875, "completions/min_length": 386.0, "epoch": 11.536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011329837143421173, "kl": 0.008732951595447958, "learning_rate": 4.5322693049806275e-07, "loss": 8.769237319938838e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 468.8125, "completions/min_length": 394.0, "epoch": 11.538235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0428144931793213, "kl": 0.013102970784530044, "learning_rate": 4.530991617542101e-07, "loss": 0.00012952834367752075, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 426.3125, "completions/min_length": 366.0, "epoch": 11.53970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013810969889163971, "kl": 0.00873864849563688, "learning_rate": 4.52971396100066e-07, "loss": 8.73371900524944e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 444.1875, "completions/min_length": 377.0, "epoch": 11.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.817837119102478, "kl": 0.015212647500447929, "learning_rate": 4.5284363354404727e-07, "loss": 0.00015232691657729447, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 469.0625, "completions/min_length": 405.0, "epoch": 11.54264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.056402869522571564, "kl": 0.010219215066172183, "learning_rate": 4.527158740945706e-07, "loss": 0.00010165949061047286, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 425.1875, "completions/min_length": 372.0, "epoch": 11.544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.017650291323661804, "kl": 0.009344784310087562, "learning_rate": 4.525881177600525e-07, "loss": 9.315554052591324e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 405.0, "completions/min_length": 349.0, "epoch": 11.545588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.027267375960946083, "kl": 0.009859468671493232, "learning_rate": 4.5246036454890934e-07, "loss": 9.903723548632115e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 431.125, "completions/min_length": 360.0, "epoch": 11.547058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1288204193115234, "kl": 0.010448514949530363, "learning_rate": 4.5233261446955693e-07, "loss": 0.00010505989484954625, "reward": 0.675000011920929, "reward_std": 0.20528726279735565, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 7852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 459.75, "completions/min_length": 432.0, "epoch": 11.548529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1175262928009033, "kl": 0.010529785184189677, "learning_rate": 4.522048675304113e-07, "loss": 0.00010522051888983697, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 445.0625, "completions/min_length": 404.0, "epoch": 11.55, "frac_reward_zero_std": 0.5, "grad_norm": 1.1061431169509888, "kl": 0.01229376276023686, "learning_rate": 4.52077123739888e-07, "loss": 0.00012294156476855278, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 485.0, "completions/min_length": 382.0, "epoch": 11.551470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9086472392082214, "kl": 0.011619491735473275, "learning_rate": 4.5194938310640254e-07, "loss": 0.00011704833741532639, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 432.1875, "completions/min_length": 375.0, "epoch": 11.552941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009878613986074924, "kl": 0.007758656051009893, "learning_rate": 4.518216456383701e-07, "loss": 7.757663843221962e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 437.6875, "completions/min_length": 359.0, "epoch": 11.554411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.020004017278552055, "kl": 0.009262623265385628, "learning_rate": 4.516939113442058e-07, "loss": 9.27913497434929e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 443.3125, "completions/min_length": 393.0, "epoch": 11.555882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.019229762256145477, "kl": 0.009632645291276276, "learning_rate": 4.515661802323243e-07, "loss": 9.586039959685877e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 389.0625, "completions/min_length": 327.0, "epoch": 11.55735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011093493551015854, "kl": 0.008132604067213833, "learning_rate": 4.5143845231114026e-07, "loss": 8.154295937856659e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 415.75, "completions/min_length": 358.0, "epoch": 11.558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.023546135053038597, "kl": 0.006779752438887954, "learning_rate": 4.513107275890681e-07, "loss": 6.742353434674442e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 481.0, "completions/min_length": 385.0, "epoch": 11.560294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009623045101761818, "kl": 0.008373894728720188, "learning_rate": 4.5118300607452195e-07, "loss": 8.380983490496874e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 482.875, "completions/min_length": 382.0, "epoch": 11.561764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9950714111328125, "kl": 0.008861640584655106, "learning_rate": 4.5105528777591587e-07, "loss": 8.811843872535974e-05, "reward": 0.6875, "reward_std": 0.19329023361206055, "rewards/DrugCombAccuracyCOTORM/mean": 0.6458333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4629814922809601, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 7862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 506.625, "completions/min_length": 425.0, "epoch": 11.563235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8780146837234497, "kl": 0.01014742348343134, "learning_rate": 4.509275727016637e-07, "loss": 0.00010130460577784106, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 533.625, "completions/min_length": 430.0, "epoch": 11.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8593301177024841, "kl": 0.010414761491119862, "learning_rate": 4.507998608601787e-07, "loss": 0.00010301917791366577, "reward": 0.75, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 496.125, "completions/min_length": 439.0, "epoch": 11.566176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9148693084716797, "kl": 0.009920546319335699, "learning_rate": 4.5067215225987434e-07, "loss": 9.979912283597514e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 528.5, "completions/min_length": 435.0, "epoch": 11.56764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.801674485206604, "kl": 0.01968473056331277, "learning_rate": 4.5054444690916376e-07, "loss": 0.0001955864718183875, "reward": 0.8848960995674133, "reward_std": 0.07942110300064087, "rewards/DrugCombAccuracyCOTORM/mean": 0.8574222326278687, "rewards/DrugCombAccuracyCOTORM/std": 0.20050480961799622, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.02846374548971653, "step": 7866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 532.0, "completions/min_length": 439.0, "epoch": 11.569117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3010389804840088, "kl": 0.011370000429451466, "learning_rate": 4.504167448164599e-07, "loss": 0.00011388957500457764, "reward": 0.7100995779037476, "reward_std": 0.40311646461486816, "rewards/DrugCombAccuracyCOTORM/mean": 0.66236412525177, "rewards/DrugCombAccuracyCOTORM/std": 0.45475924015045166, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.35075974464416504, "step": 7867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 480.75, "completions/min_length": 413.0, "epoch": 11.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8109827041625977, "kl": 0.007539020269177854, "learning_rate": 4.5028904599017534e-07, "loss": 7.52293854020536e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 434.125, "completions/min_length": 377.0, "epoch": 11.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015593722462654114, "kl": 0.006366194924339652, "learning_rate": 4.501613504387228e-07, "loss": 6.43673338345252e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 455.0625, "completions/min_length": 412.0, "epoch": 11.573529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.010142889805138111, "kl": 0.007589120417833328, "learning_rate": 4.500336581705143e-07, "loss": 7.50921608414501e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 479.6875, "completions/min_length": 386.0, "epoch": 11.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.8302127718925476, "kl": 0.010965724824927747, "learning_rate": 4.499059691939619e-07, "loss": 0.000109158456325531, "reward": 0.7405208349227905, "reward_std": 0.13162186741828918, "rewards/DrugCombAccuracyCOTORM/mean": 0.690625011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.38549479842185974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8802083134651184, "rewards/DrugCombCoverageCOTORM/std": 0.1875, "step": 7871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 494.125, "completions/min_length": 404.0, "epoch": 11.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9184487462043762, "kl": 0.010151291498914361, "learning_rate": 4.4977828351747743e-07, "loss": 0.00010176748037338257, "reward": 0.7884637117385864, "reward_std": 0.14798027276992798, "rewards/DrugCombAccuracyCOTORM/mean": 0.7453452348709106, "rewards/DrugCombAccuracyCOTORM/std": 0.3570481240749359, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.14099103212356567, "step": 7872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 473.0, "completions/min_length": 431.0, "epoch": 11.577941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011474942788481712, "kl": 0.00852845306508243, "learning_rate": 4.4965060114947264e-07, "loss": 8.552199142286554e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 455.4375, "completions/min_length": 422.0, "epoch": 11.579411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8187376856803894, "kl": 0.008786491118371487, "learning_rate": 4.495229220983587e-07, "loss": 8.730326226213947e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 417.875, "completions/min_length": 393.0, "epoch": 11.580882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012638675980269909, "kl": 0.0089492192491889, "learning_rate": 4.4939524637254717e-07, "loss": 8.936096855904907e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 381.1875, "completions/min_length": 353.0, "epoch": 11.58235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016902774572372437, "kl": 0.009317750227637589, "learning_rate": 4.4926757398044856e-07, "loss": 9.268509165849537e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 7876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 417.4375, "completions/min_length": 364.0, "epoch": 11.583823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.0966482162475586, "kl": 0.010046449024230242, "learning_rate": 4.4913990493047377e-07, "loss": 0.00010097803897224367, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 439.6875, "completions/min_length": 397.0, "epoch": 11.58529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9339112043380737, "kl": 0.01205107884015888, "learning_rate": 4.4901223923103336e-07, "loss": 0.00012013688683509827, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 428.0625, "completions/min_length": 367.0, "epoch": 11.586764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.01292802207171917, "kl": 0.008660935098305345, "learning_rate": 4.4888457689053763e-07, "loss": 8.70901276357472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 455.375, "completions/min_length": 358.0, "epoch": 11.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.020789222791790962, "kl": 0.006768663763068616, "learning_rate": 4.487569179173965e-07, "loss": 6.839165143901482e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 535.125, "completions/min_length": 422.0, "epoch": 11.589705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9447342753410339, "kl": 0.009292601607739925, "learning_rate": 4.4862926232002015e-07, "loss": 9.363889694213867e-05, "reward": 0.7417339086532593, "reward_std": 0.2139759510755539, "rewards/DrugCombAccuracyCOTORM/mean": 0.7325059175491333, "rewards/DrugCombAccuracyCOTORM/std": 0.40294647216796875, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5572916865348816, "rewards/DrugCombCoverageCOTORM/std": 0.799350917339325, "step": 7881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 419.375, "completions/min_length": 362.0, "epoch": 11.591176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012678763829171658, "kl": 0.009262935491278768, "learning_rate": 4.485016101068179e-07, "loss": 9.307976870331913e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 484.25, "completions/min_length": 429.0, "epoch": 11.592647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7867419719696045, "kl": 0.011112111154943705, "learning_rate": 4.483739612861992e-07, "loss": 0.00011053681373596191, "reward": 0.9862916469573975, "reward_std": 0.03877301141619682, "rewards/DrugCombAccuracyCOTORM/mean": 0.98416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.06333333253860474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 7883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 440.0625, "completions/min_length": 373.0, "epoch": 11.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.00877775251865387, "kl": 0.007872960064560175, "learning_rate": 4.4824631586657324e-07, "loss": 7.892082794569433e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 487.0625, "completions/min_length": 405.0, "epoch": 11.595588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9995216131210327, "kl": 0.011268862523138523, "learning_rate": 4.481186738563491e-07, "loss": 0.00011216476559638977, "reward": 0.8636666536331177, "reward_std": 0.19091026484966278, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.19245010614395142, "step": 7885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 474.8125, "completions/min_length": 394.0, "epoch": 11.597058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.012013798579573631, "kl": 0.007332473178394139, "learning_rate": 4.479910352639354e-07, "loss": 7.341773016378284e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 452.8125, "completions/min_length": 386.0, "epoch": 11.598529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9557824730873108, "kl": 0.010057487292215228, "learning_rate": 4.4786340009774076e-07, "loss": 0.00010046185343526304, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 445.3125, "completions/min_length": 406.0, "epoch": 11.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.013691301457583904, "kl": 0.008632621495053172, "learning_rate": 4.477357683661733e-07, "loss": 8.646749483887106e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 504.1875, "completions/min_length": 378.0, "epoch": 11.601470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7772796750068665, "kl": 0.012409078422933817, "learning_rate": 4.4760814007764116e-07, "loss": 0.0001232229551533237, "reward": 0.9725377559661865, "reward_std": 0.05707345902919769, "rewards/DrugCombAccuracyCOTORM/mean": 0.9695784449577332, "rewards/DrugCombAccuracyCOTORM/std": 0.0893825963139534, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 420.125, "completions/min_length": 381.0, "epoch": 11.602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012163452804088593, "kl": 0.008623195928521454, "learning_rate": 4.474805152405522e-07, "loss": 8.622083259979263e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 465.625, "completions/min_length": 417.0, "epoch": 11.604411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01438190508633852, "kl": 0.010168223641812801, "learning_rate": 4.4735289386331393e-07, "loss": 0.00010200864926446229, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 438.9375, "completions/min_length": 367.0, "epoch": 11.605882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1164720058441162, "kl": 0.010933642275631428, "learning_rate": 4.472252759543338e-07, "loss": 0.00010867416858673096, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 443.9375, "completions/min_length": 357.0, "epoch": 11.60735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014059850014746189, "kl": 0.009456556290388107, "learning_rate": 4.47097661522019e-07, "loss": 9.489941294305027e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 479.4375, "completions/min_length": 424.0, "epoch": 11.608823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.013270378112793, "kl": 0.010721610044129193, "learning_rate": 4.4697005057477634e-07, "loss": 0.00010690389171941206, "reward": 0.890625, "reward_std": 0.09219947457313538, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.1972026526927948, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 7894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 491.3125, "completions/min_length": 433.0, "epoch": 11.610294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.772223174571991, "kl": 0.007184372050687671, "learning_rate": 4.468424431210125e-07, "loss": 7.222974090836942e-05, "reward": 0.6291666626930237, "reward_std": 0.14658820629119873, "rewards/DrugCombAccuracyCOTORM/mean": 0.5833333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.4791968762874603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 500.9375, "completions/min_length": 443.0, "epoch": 11.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012939279899001122, "kl": 0.009187964373268187, "learning_rate": 4.46714839169134e-07, "loss": 9.194677841151133e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 448.3125, "completions/min_length": 391.0, "epoch": 11.613235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3161048889160156, "kl": 0.0103722820058465, "learning_rate": 4.46587238727547e-07, "loss": 0.00010337680578231812, "reward": 0.6937500238418579, "reward_std": 0.42636024951934814, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8920949101448059, "step": 7897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 469.0625, "completions/min_length": 416.0, "epoch": 11.614705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.2610955238342285, "kl": 0.009417594177648425, "learning_rate": 4.464596418046576e-07, "loss": 9.459257125854492e-05, "reward": 0.7875000238418579, "reward_std": 0.3837963938713074, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 451.0625, "completions/min_length": 420.0, "epoch": 11.616176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.007950859144330025, "kl": 0.007319167722016573, "learning_rate": 4.463320484088716e-07, "loss": 7.343824108829722e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 517.8125, "completions/min_length": 430.0, "epoch": 11.617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.3146867752075195, "kl": 0.010364366695284843, "learning_rate": 4.462044585485943e-07, "loss": 0.0001040142888086848, "reward": 0.8074896335601807, "reward_std": 0.021758992224931717, "rewards/DrugCombAccuracyCOTORM/mean": 0.768151044845581, "rewards/DrugCombAccuracyCOTORM/std": 0.2439415454864502, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9296875, "rewards/DrugCombCoverageCOTORM/std": 0.12884704768657684, "step": 7900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 448.1875, "completions/min_length": 387.0, "epoch": 11.619117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.5937458276748657, "kl": 0.010977505939081311, "learning_rate": 4.4607687223223115e-07, "loss": 0.00010848045349121094, "reward": 0.5162361264228821, "reward_std": 0.3488641083240509, "rewards/DrugCombAccuracyCOTORM/mean": 0.468645840883255, "rewards/DrugCombAccuracyCOTORM/std": 0.4915468096733093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4131944477558136, "rewards/DrugCombCoverageCOTORM/std": 0.8654237985610962, "step": 7901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 458.625, "completions/min_length": 355.0, "epoch": 11.620588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.5489798784255981, "kl": 0.01107549387961626, "learning_rate": 4.459492894681872e-07, "loss": 0.0001089051365852356, "reward": 0.8421875238418579, "reward_std": 0.3486824631690979, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 445.6875, "completions/min_length": 355.0, "epoch": 11.62205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591318964958191, "kl": 0.009865888860076666, "learning_rate": 4.458217102648673e-07, "loss": 9.911972301779315e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 471.6875, "completions/min_length": 414.0, "epoch": 11.623529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.4083315134048462, "kl": 0.012319535482674837, "learning_rate": 4.45694134630676e-07, "loss": 0.0001234784722328186, "reward": 0.6187291741371155, "reward_std": 0.3269059658050537, "rewards/DrugCombAccuracyCOTORM/mean": 0.5650781393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.4295169711112976, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5055250525474548, "step": 7904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 477.5625, "completions/min_length": 409.0, "epoch": 11.625, "frac_reward_zero_std": 0.0, "grad_norm": 8.607686042785645, "kl": 0.07545067265164107, "learning_rate": 4.455665625740178e-07, "loss": 0.0007558912038803101, "reward": 0.887499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 424.6875, "completions/min_length": 342.0, "epoch": 11.626470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.866265594959259, "kl": 0.009842018829658628, "learning_rate": 4.4543899410329657e-07, "loss": 9.749829769134521e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 436.75, "completions/min_length": 364.0, "epoch": 11.62794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8524664044380188, "kl": 0.00921515584923327, "learning_rate": 4.453114292269163e-07, "loss": 9.215547470375896e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 428.4375, "completions/min_length": 338.0, "epoch": 11.629411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008869001641869545, "kl": 0.007049572421237826, "learning_rate": 4.451838679532807e-07, "loss": 7.059550262056291e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 442.625, "completions/min_length": 397.0, "epoch": 11.630882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012724270112812519, "kl": 0.008599849883466959, "learning_rate": 4.4505631029079324e-07, "loss": 8.593900565756485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 409.625, "completions/min_length": 353.0, "epoch": 11.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.011723380535840988, "kl": 0.007160829845815897, "learning_rate": 4.449287562478569e-07, "loss": 7.095220644259825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 451.0, "completions/min_length": 383.0, "epoch": 11.633823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0341944694519043, "kl": 0.011127102887257934, "learning_rate": 4.448012058328749e-07, "loss": 0.0001122206449508667, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 7911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 427.125, "completions/min_length": 379.0, "epoch": 11.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.013523287139832973, "kl": 0.007682887255214155, "learning_rate": 4.4467365905424966e-07, "loss": 7.70848710089922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 490.625, "completions/min_length": 361.0, "epoch": 11.636764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.1986199617385864, "kl": 0.007841236772947013, "learning_rate": 4.445461159203837e-07, "loss": 7.78883695602417e-05, "reward": 0.612500011920929, "reward_std": 0.22051933407783508, "rewards/DrugCombAccuracyCOTORM/mean": 0.515625, "rewards/DrugCombAccuracyCOTORM/std": 0.4229731261730194, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 472.1875, "completions/min_length": 368.0, "epoch": 11.638235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8986517190933228, "kl": 0.008909104974009097, "learning_rate": 4.444185764396793e-07, "loss": 8.827251440379769e-05, "reward": 0.7847999930381775, "reward_std": 0.17849022150039673, "rewards/DrugCombAccuracyCOTORM/mean": 0.7466250061988831, "rewards/DrugCombAccuracyCOTORM/std": 0.3883085548877716, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.197202667593956, "step": 7914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 471.25, "completions/min_length": 365.0, "epoch": 11.639705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.4087328910827637, "kl": 0.017040069680660963, "learning_rate": 4.442910406205385e-07, "loss": 0.00017431005835533142, "reward": 0.5542928576469421, "reward_std": 0.3848339021205902, "rewards/DrugCombAccuracyCOTORM/mean": 0.5131785869598389, "rewards/DrugCombAccuracyCOTORM/std": 0.4643998146057129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.786165177822113, "step": 7915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 466.4375, "completions/min_length": 394.0, "epoch": 11.641176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7955551147460938, "kl": 0.009040848119184375, "learning_rate": 4.4416350847136287e-07, "loss": 9.085063356906176e-05, "reward": 0.6606666445732117, "reward_std": 0.15625539422035217, "rewards/DrugCombAccuracyCOTORM/mean": 0.6175000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.45407047867774963, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6666666865348816, "step": 7916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 419.875, "completions/min_length": 364.0, "epoch": 11.64264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.775436282157898, "kl": 0.008988902787677944, "learning_rate": 4.4403598000055414e-07, "loss": 9.070018859347329e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 467.5625, "completions/min_length": 371.0, "epoch": 11.644117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9473915100097656, "kl": 0.009819100378081203, "learning_rate": 4.4390845521651334e-07, "loss": 9.966791549231857e-05, "reward": 0.8553333282470703, "reward_std": 0.20654159784317017, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 7918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 486.9375, "completions/min_length": 426.0, "epoch": 11.645588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9841723442077637, "kl": 0.010392170399427414, "learning_rate": 4.437809341276415e-07, "loss": 0.00010446487431181595, "reward": 0.6476666927337646, "reward_std": 0.04396679624915123, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43676844239234924, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8166666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.20000000298023224, "step": 7919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 474.875, "completions/min_length": 424.0, "epoch": 11.647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.020158687606453896, "kl": 0.008910057367756963, "learning_rate": 4.4365341674233945e-07, "loss": 8.847628487274051e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 424.5625, "completions/min_length": 361.0, "epoch": 11.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008703655563294888, "kl": 0.006664932239800692, "learning_rate": 4.4352590306900774e-07, "loss": 6.671468872809783e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 512.9375, "completions/min_length": 434.0, "epoch": 11.65, "frac_reward_zero_std": 0.0, "grad_norm": 1.1568371057510376, "kl": 0.010356823913753033, "learning_rate": 4.4339839311604664e-07, "loss": 0.00010395795106887817, "reward": 0.7489583492279053, "reward_std": 0.3036502003669739, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.40008679032325745, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 7922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 491.5, "completions/min_length": 430.0, "epoch": 11.651470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017130350694060326, "kl": 0.009307980071753263, "learning_rate": 4.432708868918562e-07, "loss": 9.353452333016321e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 481.8125, "completions/min_length": 425.0, "epoch": 11.652941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011736825108528137, "kl": 0.0065668439492583275, "learning_rate": 4.4314338440483614e-07, "loss": 6.543716881424189e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 399.0, "completions/min_length": 333.0, "epoch": 11.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.014740864746272564, "kl": 0.009873166214674711, "learning_rate": 4.43015885663386e-07, "loss": 9.788629540707916e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 441.5625, "completions/min_length": 393.0, "epoch": 11.655882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01397154200822115, "kl": 0.00795612414367497, "learning_rate": 4.4288839067590513e-07, "loss": 7.954677857924253e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 428.0625, "completions/min_length": 351.0, "epoch": 11.657352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.08575689047574997, "kl": 0.009609176544472575, "learning_rate": 4.427608994507925e-07, "loss": 9.366923768538982e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 462.75, "completions/min_length": 404.0, "epoch": 11.658823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.009450912475586, "kl": 0.010900634340941906, "learning_rate": 4.4263341199644694e-07, "loss": 0.00010841339826583862, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 7928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 486.0, "completions/min_length": 438.0, "epoch": 11.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.06909526884555817, "kl": 0.00977577653247863, "learning_rate": 4.425059283212672e-07, "loss": 9.680945368018001e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 462.4375, "completions/min_length": 392.0, "epoch": 11.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.05007317662239075, "kl": 0.009455975610762835, "learning_rate": 4.423784484336512e-07, "loss": 9.594310540705919e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 417.3125, "completions/min_length": 360.0, "epoch": 11.663235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.011740004643797874, "kl": 0.008307807380333543, "learning_rate": 4.422509723419973e-07, "loss": 8.261686161858961e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 522.0625, "completions/min_length": 433.0, "epoch": 11.66470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9178586006164551, "kl": 0.01011943118646741, "learning_rate": 4.4212350005470317e-07, "loss": 0.000101529061794281, "reward": 0.5, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 439.375, "completions/min_length": 392.0, "epoch": 11.666176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0182870551943779, "kl": 0.010290020843967795, "learning_rate": 4.4199603158016634e-07, "loss": 0.00010285030293744057, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 467.75, "completions/min_length": 407.0, "epoch": 11.66764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9032634496688843, "kl": 0.008241106639616191, "learning_rate": 4.418685669267842e-07, "loss": 8.25623283162713e-05, "reward": 0.45000001788139343, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 457.5, "completions/min_length": 375.0, "epoch": 11.669117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010285414755344391, "kl": 0.007507556118071079, "learning_rate": 4.4174110610295386e-07, "loss": 7.506214751629159e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 524.125, "completions/min_length": 449.0, "epoch": 11.670588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.2395927906036377, "kl": 0.0074182761600241065, "learning_rate": 4.4161364911707186e-07, "loss": 7.449835538864136e-05, "reward": 0.8250000476837158, "reward_std": 0.2505747675895691, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 7936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 471.25, "completions/min_length": 430.0, "epoch": 11.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8872299194335938, "kl": 0.010009934892877936, "learning_rate": 4.4148619597753503e-07, "loss": 9.965440403902903e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 465.5625, "completions/min_length": 424.0, "epoch": 11.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.857933759689331, "kl": 0.008352352189831436, "learning_rate": 4.413587466927394e-07, "loss": 8.448523294646293e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 415.875, "completions/min_length": 383.0, "epoch": 11.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.032355889678001404, "kl": 0.008901903755031526, "learning_rate": 4.412313012710812e-07, "loss": 8.912431803764775e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 478.9375, "completions/min_length": 423.0, "epoch": 11.676470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 1.7420746088027954, "kl": 0.008651072508655488, "learning_rate": 4.411038597209561e-07, "loss": 8.660554885864258e-05, "reward": 0.6000000238418579, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 471.625, "completions/min_length": 371.0, "epoch": 11.677941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2559350728988647, "kl": 0.01119672879576683, "learning_rate": 4.409764220507598e-07, "loss": 0.00011105090379714966, "reward": 0.643750011920929, "reward_std": 0.3442630469799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 463.4375, "completions/min_length": 428.0, "epoch": 11.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0801329612731934, "kl": 0.01147274486720562, "learning_rate": 4.408489882688874e-07, "loss": 0.00011423183605074883, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 414.875, "completions/min_length": 381.0, "epoch": 11.680882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.06058311462402344, "kl": 0.01150244870223105, "learning_rate": 4.4072155838373386e-07, "loss": 0.00011471710604382679, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 7943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 488.375, "completions/min_length": 430.0, "epoch": 11.68235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.94828200340271, "kl": 0.012917227344587445, "learning_rate": 4.4059413240369406e-07, "loss": 0.00012934808910358697, "reward": 0.7802083492279053, "reward_std": 0.20410895347595215, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 7944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 481.3125, "completions/min_length": 452.0, "epoch": 11.683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010865995660424232, "kl": 0.008432422648184001, "learning_rate": 4.404667103371625e-07, "loss": 8.41603905428201e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 466.6875, "completions/min_length": 388.0, "epoch": 11.685294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9387685060501099, "kl": 0.01059673388954252, "learning_rate": 4.4033929219253346e-07, "loss": 0.00010632479097694159, "reward": 0.9470833539962769, "reward_std": 0.057829439640045166, "rewards/DrugCombAccuracyCOTORM/mean": 0.9416666626930237, "rewards/DrugCombAccuracyCOTORM/std": 0.10852546989917755, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 7946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 457.0625, "completions/min_length": 402.0, "epoch": 11.686764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.017128104344010353, "kl": 0.010651644319295883, "learning_rate": 4.402118779782009e-07, "loss": 0.00010598880180623382, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 464.6875, "completions/min_length": 393.0, "epoch": 11.688235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9035386443138123, "kl": 0.00974379840772599, "learning_rate": 4.4008446770255847e-07, "loss": 9.725242853164673e-05, "reward": 0.9462499618530273, "reward_std": 0.15202796459197998, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.15000000596046448, "step": 7948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 472.5, "completions/min_length": 398.0, "epoch": 11.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8129538893699646, "kl": 0.008614957449026406, "learning_rate": 4.399570613739996e-07, "loss": 8.592754602432251e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 7949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.5, "completions/min_length": 395.0, "epoch": 11.691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1434229612350464, "kl": 0.011938894400373101, "learning_rate": 4.398296590009176e-07, "loss": 0.00011959284165641293, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.0625, "completions/min_length": 362.0, "epoch": 11.69264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8328555226325989, "kl": 0.012693943455815315, "learning_rate": 4.3970226059170545e-07, "loss": 0.00012787431478500366, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 433.6875, "completions/min_length": 377.0, "epoch": 11.694117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013824092224240303, "kl": 0.007686561439186335, "learning_rate": 4.3957486615475576e-07, "loss": 7.705576717853546e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 404.375, "completions/min_length": 348.0, "epoch": 11.695588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012224426493048668, "kl": 0.00694528769236058, "learning_rate": 4.39447475698461e-07, "loss": 6.860509893158451e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 441.8125, "completions/min_length": 378.0, "epoch": 11.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.3286653757095337, "kl": 0.016672012396156788, "learning_rate": 4.3932008923121326e-07, "loss": 0.0001677553227636963, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 437.75, "completions/min_length": 399.0, "epoch": 11.698529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075800130143761635, "kl": 0.007397940498776734, "learning_rate": 4.391927067614045e-07, "loss": 7.422958879033104e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 519.9375, "completions/min_length": 420.0, "epoch": 11.7, "frac_reward_zero_std": 0.0, "grad_norm": 1.372170329093933, "kl": 0.011181040550582111, "learning_rate": 4.390653282974263e-07, "loss": 0.00011069327592849731, "reward": 0.8017857074737549, "reward_std": 0.2559588551521301, "rewards/DrugCombAccuracyCOTORM/mean": 0.7678571939468384, "rewards/DrugCombAccuracyCOTORM/std": 0.3864191174507141, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 447.25, "completions/min_length": 393.0, "epoch": 11.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008589287288486958, "kl": 0.006546386051923037, "learning_rate": 4.389379538476701e-07, "loss": 6.578756438102573e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 489.625, "completions/min_length": 403.0, "epoch": 11.702941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5226393938064575, "kl": 0.03139424417167902, "learning_rate": 4.388105834205269e-07, "loss": 0.0003083348274230957, "reward": 0.6248958110809326, "reward_std": 0.3167058825492859, "rewards/DrugCombAccuracyCOTORM/mean": 0.557812511920929, "rewards/DrugCombAccuracyCOTORM/std": 0.41238322854042053, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.3232860863208771, "step": 7958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 510.0, "completions/min_length": 437.0, "epoch": 11.704411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.449294924736023, "kl": 0.010582043323665857, "learning_rate": 4.3868321702438775e-07, "loss": 0.00010615214705467224, "reward": 0.5383184552192688, "reward_std": 0.28017234802246094, "rewards/DrugCombAccuracyCOTORM/mean": 0.4411271810531616, "rewards/DrugCombAccuracyCOTORM/std": 0.4179686903953552, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17612075805664062, "step": 7959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 442.1875, "completions/min_length": 336.0, "epoch": 11.705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01427539438009262, "kl": 0.009035094175487757, "learning_rate": 4.38555854667643e-07, "loss": 9.002639126265422e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 460.875, "completions/min_length": 423.0, "epoch": 11.70735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007558468263596296, "kl": 0.005969295860268176, "learning_rate": 4.384284963586831e-07, "loss": 5.9640882682288066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 422.5625, "completions/min_length": 355.0, "epoch": 11.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.082523226737976, "kl": 0.00886511744465679, "learning_rate": 4.38301142105898e-07, "loss": 8.834823529468849e-05, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 7962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 406.0, "completions/min_length": 363.0, "epoch": 11.71029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.07423565536737442, "kl": 0.011595039162784815, "learning_rate": 4.3817379191767745e-07, "loss": 0.00011566386820049956, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 465.8125, "completions/min_length": 411.0, "epoch": 11.711764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9500048756599426, "kl": 0.010665975511074066, "learning_rate": 4.380464458024112e-07, "loss": 0.00010753273090813309, "reward": 0.906833291053772, "reward_std": 0.17410697042942047, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 7964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 450.4375, "completions/min_length": 409.0, "epoch": 11.713235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9748212099075317, "kl": 0.012135214870795608, "learning_rate": 4.379191037684882e-07, "loss": 0.00012111291289329529, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 462.375, "completions/min_length": 423.0, "epoch": 11.714705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.022792935371399, "kl": 0.010243940399959683, "learning_rate": 4.3779176582429747e-07, "loss": 0.00010223314166069031, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 455.25, "completions/min_length": 396.0, "epoch": 11.716176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.2419756650924683, "kl": 0.011599870631471276, "learning_rate": 4.3766443197822786e-07, "loss": 0.00011625513434410095, "reward": 0.3802083432674408, "reward_std": 0.2842344641685486, "rewards/DrugCombAccuracyCOTORM/mean": 0.2604166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3275540769100189, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.71875, "rewards/DrugCombCoverageCOTORM/std": 0.682367205619812, "step": 7967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 428.5, "completions/min_length": 365.0, "epoch": 11.717647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013702776283025742, "kl": 0.008793223882094026, "learning_rate": 4.375371022386677e-07, "loss": 8.77911297720857e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 479.875, "completions/min_length": 430.0, "epoch": 11.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1087487936019897, "kl": 0.013606612337753177, "learning_rate": 4.3740977661400516e-07, "loss": 0.0001355261483695358, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 472.1875, "completions/min_length": 419.0, "epoch": 11.720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012162866070866585, "kl": 0.006817043060436845, "learning_rate": 4.372824551126283e-07, "loss": 6.824522279202938e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 407.0, "completions/min_length": 337.0, "epoch": 11.722058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.010098131373524666, "kl": 0.007193336263298988, "learning_rate": 4.371551377429244e-07, "loss": 7.179623935371637e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 457.3125, "completions/min_length": 395.0, "epoch": 11.723529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.2999866008758545, "kl": 0.009017394040711224, "learning_rate": 4.3702782451328104e-07, "loss": 9.00663435459137e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 7972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 457.0625, "completions/min_length": 399.0, "epoch": 11.725, "frac_reward_zero_std": 1.0, "grad_norm": 0.04888520389795303, "kl": 0.011332035064697266, "learning_rate": 4.369005154320852e-07, "loss": 0.00011201230518054217, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 490.8125, "completions/min_length": 448.0, "epoch": 11.726470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8938722610473633, "kl": 0.011160494526848197, "learning_rate": 4.367732105077237e-07, "loss": 0.00011163949966430664, "reward": 0.6630622148513794, "reward_std": 0.010236070491373539, "rewards/DrugCombAccuracyCOTORM/mean": 0.6027535200119019, "rewards/DrugCombAccuracyCOTORM/std": 0.4104463756084442, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.80859375, "rewards/DrugCombCoverageCOTORM/std": 0.20276492834091187, "step": 7974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 501.0625, "completions/min_length": 425.0, "epoch": 11.727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9401534795761108, "kl": 0.009507252252660692, "learning_rate": 4.366459097485831e-07, "loss": 9.466707706451416e-05, "reward": 0.6200000047683716, "reward_std": 0.04073607921600342, "rewards/DrugCombAccuracyCOTORM/mean": 0.5458333492279053, "rewards/DrugCombAccuracyCOTORM/std": 0.4741925001144409, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 7975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 417.5, "completions/min_length": 365.0, "epoch": 11.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.031798817217350006, "kl": 0.008828563732095063, "learning_rate": 4.3651861316304976e-07, "loss": 8.90097362571396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 498.9375, "completions/min_length": 453.0, "epoch": 11.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8244976997375488, "kl": 0.009444827563129365, "learning_rate": 4.363913207595095e-07, "loss": 9.479653817834333e-05, "reward": 0.8766666650772095, "reward_std": 0.025305744260549545, "rewards/DrugCombAccuracyCOTORM/mean": 0.856249988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.1546267867088318, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 7977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 440.125, "completions/min_length": 386.0, "epoch": 11.73235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015623449347913265, "kl": 0.010309179546311498, "learning_rate": 4.3626403254634793e-07, "loss": 0.00010309845674782991, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 435.0625, "completions/min_length": 330.0, "epoch": 11.733823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010916514322161674, "kl": 0.009196265484206378, "learning_rate": 4.3613674853195066e-07, "loss": 9.217939077643678e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 440.0, "completions/min_length": 359.0, "epoch": 11.735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.8841308355331421, "kl": 0.008284820360131562, "learning_rate": 4.3600946872470273e-07, "loss": 8.320063352584839e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 7980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 492.9375, "completions/min_length": 456.0, "epoch": 11.736764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014637814834713936, "kl": 0.00818747456651181, "learning_rate": 4.358821931329891e-07, "loss": 8.193388930521905e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 484.6875, "completions/min_length": 434.0, "epoch": 11.738235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.5732024908065796, "kl": 0.012183927930891514, "learning_rate": 4.3575492176519445e-07, "loss": 0.00012154877185821533, "reward": 0.5874999761581421, "reward_std": 0.2920154333114624, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 7982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 370.6875, "completions/min_length": 297.0, "epoch": 11.739705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.025944186374545097, "kl": 0.008939664461649954, "learning_rate": 4.356276546297029e-07, "loss": 8.915048965718597e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 467.0625, "completions/min_length": 360.0, "epoch": 11.741176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9198592901229858, "kl": 0.009151967475190759, "learning_rate": 4.3550039173489843e-07, "loss": 9.244475222658366e-05, "reward": 0.8509583473205566, "reward_std": 0.19233578443527222, "rewards/DrugCombAccuracyCOTORM/mean": 0.8332291841506958, "rewards/DrugCombAccuracyCOTORM/std": 0.3279569149017334, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 7984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 491.8125, "completions/min_length": 400.0, "epoch": 11.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9733045697212219, "kl": 0.014659723499789834, "learning_rate": 4.3537313308916504e-07, "loss": 0.00014699001621920615, "reward": 0.3499999940395355, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 7985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 436.3125, "completions/min_length": 383.0, "epoch": 11.744117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03977527469396591, "kl": 0.011053630616515875, "learning_rate": 4.35245878700886e-07, "loss": 0.00011042854021070525, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 478.375, "completions/min_length": 423.0, "epoch": 11.745588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.3026074171066284, "kl": 0.009079478681087494, "learning_rate": 4.3511862857844467e-07, "loss": 8.991360664367676e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 7987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 496.1875, "completions/min_length": 426.0, "epoch": 11.74705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022747816517949104, "kl": 0.007828375324606895, "learning_rate": 4.3499138273022397e-07, "loss": 7.797694706823677e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 446.875, "completions/min_length": 385.0, "epoch": 11.748529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.6225801706314087, "kl": 0.011912587331607938, "learning_rate": 4.3486414116460635e-07, "loss": 0.00011925399303436279, "reward": 0.8687499761581421, "reward_std": 0.2491326928138733, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 7989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 538.1875, "completions/min_length": 480.0, "epoch": 11.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.8984041810035706, "kl": 0.007359497249126434, "learning_rate": 4.347369038899743e-07, "loss": 7.328798528760672e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 455.125, "completions/min_length": 416.0, "epoch": 11.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8849637508392334, "kl": 0.006843902403488755, "learning_rate": 4.3460967091470984e-07, "loss": 6.834417581558228e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 444.0625, "completions/min_length": 379.0, "epoch": 11.75294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009975215420126915, "kl": 0.006327057955786586, "learning_rate": 4.344824422471948e-07, "loss": 6.313658377621323e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 474.0625, "completions/min_length": 413.0, "epoch": 11.754411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.089708924293518, "kl": 0.010083975736051798, "learning_rate": 4.3435521789581066e-07, "loss": 0.000100698322057724, "reward": 0.6107000112533569, "reward_std": 0.36690303683280945, "rewards/DrugCombAccuracyCOTORM/mean": 0.5164999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.503411054611206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 7993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 457.625, "completions/min_length": 386.0, "epoch": 11.755882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.008810536935925484, "kl": 0.00788228155579418, "learning_rate": 4.3422799786893873e-07, "loss": 7.921259384602308e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 476.0625, "completions/min_length": 421.0, "epoch": 11.757352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1035330295562744, "kl": 0.009926187805831432, "learning_rate": 4.341007821749597e-07, "loss": 9.94047150015831e-05, "reward": 0.7837499976158142, "reward_std": 0.2118549942970276, "rewards/DrugCombAccuracyCOTORM/mean": 0.7479166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.42078039050102234, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 7995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 472.6875, "completions/min_length": 423.0, "epoch": 11.758823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.015060305595398, "kl": 0.009607442189007998, "learning_rate": 4.3397357082225437e-07, "loss": 9.631332068238407e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 466.0625, "completions/min_length": 422.0, "epoch": 11.760294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.5969665050506592, "kl": 0.008833874948322773, "learning_rate": 4.338463638192032e-07, "loss": 8.868053555488586e-05, "reward": 0.7678999900817871, "reward_std": 0.3739206790924072, "rewards/DrugCombAccuracyCOTORM/mean": 0.7317500114440918, "rewards/DrugCombAccuracyCOTORM/std": 0.4166903793811798, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.824999988079071, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 7997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 421.625, "completions/min_length": 337.0, "epoch": 11.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011990193277597427, "kl": 0.008731473819352686, "learning_rate": 4.337191611741861e-07, "loss": 8.725134830456227e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 7998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 505.375, "completions/min_length": 428.0, "epoch": 11.763235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3405200242996216, "kl": 0.01310032862238586, "learning_rate": 4.335919628955829e-07, "loss": 0.00013297423720359802, "reward": 0.6114583611488342, "reward_std": 0.2584834694862366, "rewards/DrugCombAccuracyCOTORM/mean": 0.5208333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.438325971364975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 7999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 509.1875, "completions/min_length": 393.0, "epoch": 11.764705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0906596183776855, "kl": 0.009297001641243696, "learning_rate": 4.334647689917733e-07, "loss": 9.354632493341342e-05, "reward": 0.8270833492279053, "reward_std": 0.1119585856795311, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 498.875, "completions/min_length": 460.0, "epoch": 11.766176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.018193444237113, "kl": 0.008486157050356269, "learning_rate": 4.333375794711362e-07, "loss": 8.495192741975188e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 483.75, "completions/min_length": 414.0, "epoch": 11.76764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.4446338415145874, "kl": 0.011333173839375377, "learning_rate": 4.3321039434205064e-07, "loss": 0.00011344254016876221, "reward": 0.3139166831970215, "reward_std": 0.26930999755859375, "rewards/DrugCombAccuracyCOTORM/mean": 0.20749999582767487, "rewards/DrugCombAccuracyCOTORM/std": 0.326751708984375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.75, "step": 8002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 427.625, "completions/min_length": 370.0, "epoch": 11.769117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9383581280708313, "kl": 0.010388546157628298, "learning_rate": 4.330832136128953e-07, "loss": 0.00010398119775345549, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 456.1875, "completions/min_length": 405.0, "epoch": 11.770588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.046647310256958, "kl": 0.016753707313910127, "learning_rate": 4.3295603729204855e-07, "loss": 0.00017305556684732437, "reward": 0.637499988079071, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 8004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 413.0, "completions/min_length": 364.0, "epoch": 11.772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.1065421104431152, "kl": 0.008585577365010977, "learning_rate": 4.328288653878883e-07, "loss": 8.541345596313477e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 515.4375, "completions/min_length": 384.0, "epoch": 11.773529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2487400770187378, "kl": 0.016527112806215882, "learning_rate": 4.327016979087926e-07, "loss": 0.00016860663890838623, "reward": 0.45625001192092896, "reward_std": 0.3661186695098877, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 410.1875, "completions/min_length": 337.0, "epoch": 11.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.015099395997822285, "kl": 0.007058950490318239, "learning_rate": 4.325745348631386e-07, "loss": 7.067376282066107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 405.625, "completions/min_length": 317.0, "epoch": 11.776470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.014746521599590778, "kl": 0.007505180547013879, "learning_rate": 4.324473762593036e-07, "loss": 7.433665450662374e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 462.25, "completions/min_length": 399.0, "epoch": 11.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007739777211099863, "kl": 0.007698615314438939, "learning_rate": 4.323202221056645e-07, "loss": 7.72231724113226e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 450.4375, "completions/min_length": 396.0, "epoch": 11.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9581412672996521, "kl": 0.01231936423573643, "learning_rate": 4.321930724105979e-07, "loss": 0.0001231257920153439, "reward": 0.7425000071525574, "reward_std": 0.16180676221847534, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.41161268949508667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 8010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/mean_length": 513.3125, "completions/min_length": 414.0, "epoch": 11.780882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0594370365142822, "kl": 0.00893443450331688, "learning_rate": 4.320659271824801e-07, "loss": 8.911395707400516e-05, "reward": 0.8142889142036438, "reward_std": 0.1976170837879181, "rewards/DrugCombAccuracyCOTORM/mean": 0.781749963760376, "rewards/DrugCombAccuracyCOTORM/std": 0.39315468072891235, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8888888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.19876159727573395, "step": 8011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 460.0, "completions/min_length": 399.0, "epoch": 11.782352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.7668905854225159, "kl": 0.009245510911569, "learning_rate": 4.319387864296872e-07, "loss": 9.187310934066772e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 462.0, "completions/min_length": 394.0, "epoch": 11.783823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0948774814605713, "kl": 0.00907700345851481, "learning_rate": 4.318116501605947e-07, "loss": 9.006461914395913e-05, "reward": 0.7593749761581421, "reward_std": 0.21196536719799042, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.394405335187912, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.59375, "rewards/DrugCombCoverageCOTORM/std": 0.6884463429450989, "step": 8013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 477.5625, "completions/min_length": 421.0, "epoch": 11.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7553535103797913, "kl": 0.00982756819576025, "learning_rate": 4.316845183835781e-07, "loss": 9.781122207641602e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 420.8125, "completions/min_length": 386.0, "epoch": 11.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.007951564155519009, "kl": 0.006066218833439052, "learning_rate": 4.315573911070125e-07, "loss": 6.0709826357197016e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 430.1875, "completions/min_length": 332.0, "epoch": 11.788235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.01659989356994629, "kl": 0.009879635646939278, "learning_rate": 4.3143026833927287e-07, "loss": 9.868374763755128e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 445.375, "completions/min_length": 400.0, "epoch": 11.78970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.6181012392044067, "kl": 0.010354983271099627, "learning_rate": 4.313031500887336e-07, "loss": 0.00010319799184799194, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 449.625, "completions/min_length": 390.0, "epoch": 11.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1755585670471191, "kl": 0.010618755011819303, "learning_rate": 4.3117603636376904e-07, "loss": 0.00010634503269102424, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 518.25, "completions/min_length": 383.0, "epoch": 11.79264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6160486936569214, "kl": 0.013596170814707875, "learning_rate": 4.310489271727529e-07, "loss": 0.00013572722673416138, "reward": 0.1584966629743576, "reward_std": 0.07849070429801941, "rewards/DrugCombAccuracyCOTORM/mean": 0.07259999960660934, "rewards/DrugCombAccuracyCOTORM/std": 0.11239681392908096, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.004166692495346069, "rewards/DrugCombCoverageCOTORM/std": 0.915211021900177, "step": 8019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 472.75, "completions/min_length": 348.0, "epoch": 11.794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.14450885355472565, "kl": 0.012934138416312635, "learning_rate": 4.30921822524059e-07, "loss": 0.00013232615310698748, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.9375, "completions/min_length": 394.0, "epoch": 11.795588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.97966468334198, "kl": 0.010669952724128962, "learning_rate": 4.307947224260606e-07, "loss": 0.00010568437573965639, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 448.1875, "completions/min_length": 415.0, "epoch": 11.797058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.2465256452560425, "kl": 0.01392647810280323, "learning_rate": 4.3066762688713074e-07, "loss": 0.00014099646068643779, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 417.125, "completions/min_length": 374.0, "epoch": 11.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012176706455647945, "kl": 0.007697076536715031, "learning_rate": 4.3054053591564216e-07, "loss": 7.649152394151315e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 447.3125, "completions/min_length": 378.0, "epoch": 11.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.013725398108363152, "kl": 0.007591216708533466, "learning_rate": 4.304134495199674e-07, "loss": 7.534456381108612e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 445.625, "completions/min_length": 396.0, "epoch": 11.801470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8513509035110474, "kl": 0.009934741887263954, "learning_rate": 4.302863677084784e-07, "loss": 9.866541950032115e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 378.375, "completions/min_length": 324.0, "epoch": 11.802941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014730474911630154, "kl": 0.009853321476839483, "learning_rate": 4.3015929048954703e-07, "loss": 9.761948604136705e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 487.0, "completions/min_length": 415.0, "epoch": 11.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015691662207245827, "kl": 0.006595569895580411, "learning_rate": 4.300322178715449e-07, "loss": 6.601028144359589e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 419.9375, "completions/min_length": 347.0, "epoch": 11.805882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.04854876548051834, "kl": 0.010010521044023335, "learning_rate": 4.299051498628432e-07, "loss": 9.96616727206856e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 456.5625, "completions/min_length": 409.0, "epoch": 11.80735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1474545001983643, "kl": 0.008754707989282906, "learning_rate": 4.297780864718128e-07, "loss": 8.758821059018373e-05, "reward": 0.9102500081062317, "reward_std": 0.1661846935749054, "rewards/DrugCombAccuracyCOTORM/mean": 0.8956249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.28520679473876953, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 8029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 472.1875, "completions/min_length": 375.0, "epoch": 11.808823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3804713487625122, "kl": 0.012872610008344054, "learning_rate": 4.296510277068245e-07, "loss": 0.00012694299221038818, "reward": 0.35747918486595154, "reward_std": 0.3331785202026367, "rewards/DrugCombAccuracyCOTORM/mean": 0.21833333373069763, "rewards/DrugCombAccuracyCOTORM/std": 0.3960059583187103, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.828125, "rewards/DrugCombCoverageCOTORM/std": 0.5221650004386902, "step": 8030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 428.5625, "completions/min_length": 359.0, "epoch": 11.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03248411789536476, "kl": 0.013139119138941169, "learning_rate": 4.2952397357624833e-07, "loss": 0.00013173937622923404, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 423.9375, "completions/min_length": 373.0, "epoch": 11.811764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011112317442893982, "kl": 0.00750725413672626, "learning_rate": 4.293969240884544e-07, "loss": 7.53865169826895e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 411.1875, "completions/min_length": 352.0, "epoch": 11.813235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.03271573409438133, "kl": 0.011443947907537222, "learning_rate": 4.292698792518125e-07, "loss": 0.00011367032129783183, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 484.3125, "completions/min_length": 431.0, "epoch": 11.814705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.019191740080714226, "kl": 0.009961423929780722, "learning_rate": 4.291428390746919e-07, "loss": 9.918354771798477e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 465.1875, "completions/min_length": 368.0, "epoch": 11.816176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.022552944719791412, "kl": 0.010017600492574275, "learning_rate": 4.2901580356546176e-07, "loss": 0.00010091255535371602, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 440.5625, "completions/min_length": 386.0, "epoch": 11.81764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011640087701380253, "kl": 0.009819221682846546, "learning_rate": 4.28888772732491e-07, "loss": 9.79660835582763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 443.3125, "completions/min_length": 330.0, "epoch": 11.819117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.1386958360671997, "kl": 0.01085561781655997, "learning_rate": 4.2876174658414777e-07, "loss": 0.0001085854964912869, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/mean_length": 471.0625, "completions/min_length": 376.0, "epoch": 11.820588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.1706761121749878, "kl": 0.012015682994388044, "learning_rate": 4.286347251288004e-07, "loss": 0.00012070685625076294, "reward": 0.708160400390625, "reward_std": 0.13051016628742218, "rewards/DrugCombAccuracyCOTORM/mean": 0.6478958129882812, "rewards/DrugCombAccuracyCOTORM/std": 0.42478808760643005, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8984375, "rewards/DrugCombCoverageCOTORM/std": 0.12263385951519012, "step": 8038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 478.25, "completions/min_length": 378.0, "epoch": 11.822058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.2100666761398315, "kl": 0.009124342584982514, "learning_rate": 4.2850770837481675e-07, "loss": 9.185820817947388e-05, "reward": 0.887499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 465.5, "completions/min_length": 401.0, "epoch": 11.823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.023223232477903366, "kl": 0.010476330993697047, "learning_rate": 4.2838069633056436e-07, "loss": 0.00010322424350306392, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 476.125, "completions/min_length": 402.0, "epoch": 11.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.013700938783586025, "kl": 0.010504195000976324, "learning_rate": 4.2825368900441037e-07, "loss": 0.00010443449718877673, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 460.375, "completions/min_length": 420.0, "epoch": 11.826470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9543545842170715, "kl": 0.011444475501775742, "learning_rate": 4.28126686404722e-07, "loss": 0.0001136243372457102, "reward": 0.8374999761581421, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 436.3125, "completions/min_length": 393.0, "epoch": 11.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013675571419298649, "kl": 0.007634135661646724, "learning_rate": 4.2799968853986545e-07, "loss": 7.575217023259029e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 491.75, "completions/min_length": 438.0, "epoch": 11.829411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0067946910858154, "kl": 0.011755307437852025, "learning_rate": 4.2787269541820723e-07, "loss": 0.00011779760825447738, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 425.9375, "completions/min_length": 373.0, "epoch": 11.830882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8304612040519714, "kl": 0.007991105667315423, "learning_rate": 4.2774570704811337e-07, "loss": 7.98550681793131e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/mean_length": 542.8125, "completions/min_length": 427.0, "epoch": 11.83235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.613372266292572, "kl": 0.010024838149547577, "learning_rate": 4.276187234379494e-07, "loss": 0.00010135117918252945, "reward": 0.6987500190734863, "reward_std": 0.1859675496816635, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.987500011920929, "rewards/DrugCombCoverageCOTORM/std": 0.05000000074505806, "step": 8046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 457.5, "completions/min_length": 424.0, "epoch": 11.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010102915577590466, "kl": 0.0077515627490356565, "learning_rate": 4.274917445960808e-07, "loss": 7.743881724309176e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 437.375, "completions/min_length": 383.0, "epoch": 11.83529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0243115425109863, "kl": 0.009705367614515126, "learning_rate": 4.2736477053087263e-07, "loss": 9.74014401435852e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 461.5, "completions/min_length": 415.0, "epoch": 11.836764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.041679322719573975, "kl": 0.009895928087644279, "learning_rate": 4.272378012506895e-07, "loss": 9.648367995396256e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 435.1875, "completions/min_length": 389.0, "epoch": 11.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7396448254585266, "kl": 0.0073698851047083735, "learning_rate": 4.2711083676389585e-07, "loss": 7.359776645898819e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 524.5, "completions/min_length": 428.0, "epoch": 11.839705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.5581635236740112, "kl": 0.016386531991884112, "learning_rate": 4.2698387707885587e-07, "loss": 0.0001649037003517151, "reward": 0.542187511920929, "reward_std": 0.4313102960586548, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 8051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 442.5625, "completions/min_length": 392.0, "epoch": 11.841176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.009441846050322056, "kl": 0.0061043285531923175, "learning_rate": 4.268569222039332e-07, "loss": 6.0767946706619114e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 437.5625, "completions/min_length": 400.0, "epoch": 11.842647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.015830161049962044, "kl": 0.009152117418125272, "learning_rate": 4.267299721474915e-07, "loss": 9.063664037967101e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 502.8125, "completions/min_length": 462.0, "epoch": 11.844117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8781735301017761, "kl": 0.00926734774839133, "learning_rate": 4.2660302691789365e-07, "loss": 9.327526640845463e-05, "reward": 0.987500011920929, "reward_std": 0.023145489394664764, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 418.1875, "completions/min_length": 378.0, "epoch": 11.845588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.015501940622925758, "kl": 0.00874763319734484, "learning_rate": 4.264760865235027e-07, "loss": 8.779136260272935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 484.4375, "completions/min_length": 354.0, "epoch": 11.847058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013115037232637405, "kl": 0.007697812397964299, "learning_rate": 4.2634915097268117e-07, "loss": 7.754141552140936e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 435.125, "completions/min_length": 373.0, "epoch": 11.848529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.023521874099969864, "kl": 0.0103582504671067, "learning_rate": 4.2622222027379124e-07, "loss": 0.00010374642442911863, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 430.1875, "completions/min_length": 392.0, "epoch": 11.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.013041811063885689, "kl": 0.008251810446381569, "learning_rate": 4.2609529443519464e-07, "loss": 8.238259033532813e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 466.3125, "completions/min_length": 426.0, "epoch": 11.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.220535159111023, "kl": 0.009132401435635984, "learning_rate": 4.2596837346525304e-07, "loss": 9.141117334365845e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 439.8125, "completions/min_length": 395.0, "epoch": 11.852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.2100197076797485, "kl": 0.00976050947792828, "learning_rate": 4.258414573723276e-07, "loss": 9.779632091522217e-05, "reward": 0.7437499761581421, "reward_std": 0.21286733448505402, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 441.125, "completions/min_length": 376.0, "epoch": 11.854411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.054360032081604, "kl": 0.009645757265388966, "learning_rate": 4.2571454616477935e-07, "loss": 9.613489964976907e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 417.1875, "completions/min_length": 365.0, "epoch": 11.855882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8733702898025513, "kl": 0.008426692220382392, "learning_rate": 4.255876398509688e-07, "loss": 8.453141344944015e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 458.6875, "completions/min_length": 394.0, "epoch": 11.85735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9679259657859802, "kl": 0.011039269040338695, "learning_rate": 4.2546073843925623e-07, "loss": 0.00011031018948415294, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/mean_length": 565.5, "completions/min_length": 400.0, "epoch": 11.858823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.183869481086731, "kl": 0.009301848476752639, "learning_rate": 4.253338419380016e-07, "loss": 9.313972259406e-05, "reward": 0.5815972089767456, "reward_std": 0.011301686055958271, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8159722089767456, "rewards/DrugCombCoverageCOTORM/std": 0.24488072097301483, "step": 8064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 472.5, "completions/min_length": 425.0, "epoch": 11.860294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1722183227539062, "kl": 0.014422339154407382, "learning_rate": 4.252069503555644e-07, "loss": 0.00014279689639806747, "reward": 0.7774583697319031, "reward_std": 0.14291277527809143, "rewards/DrugCombAccuracyCOTORM/mean": 0.7439583539962769, "rewards/DrugCombAccuracyCOTORM/std": 0.3552343547344208, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22334784269332886, "step": 8065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 462.75, "completions/min_length": 410.0, "epoch": 11.861764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.021086500957608223, "kl": 0.01189970481209457, "learning_rate": 4.250800637003041e-07, "loss": 0.00011925442959181964, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 436.6875, "completions/min_length": 388.0, "epoch": 11.863235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009019817225635052, "kl": 0.006936814519576728, "learning_rate": 4.2495318198057955e-07, "loss": 6.966499495320022e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 474.4375, "completions/min_length": 422.0, "epoch": 11.864705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.8878169655799866, "kl": 0.009326972882263362, "learning_rate": 4.248263052047495e-07, "loss": 9.37295553740114e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 409.0625, "completions/min_length": 368.0, "epoch": 11.866176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010007869452238083, "kl": 0.007788565242663026, "learning_rate": 4.2469943338117234e-07, "loss": 7.706751057412475e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 428.625, "completions/min_length": 370.0, "epoch": 11.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.014912193641066551, "kl": 0.008544405922293663, "learning_rate": 4.245725665182058e-07, "loss": 8.52062221383676e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 503.375, "completions/min_length": 436.0, "epoch": 11.869117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.373674988746643, "kl": 0.010281042195856571, "learning_rate": 4.244457046242077e-07, "loss": 0.00010212510824203491, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 453.0625, "completions/min_length": 406.0, "epoch": 11.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7157415151596069, "kl": 0.009065191261470318, "learning_rate": 4.2431884770753533e-07, "loss": 9.12100076675415e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 462.8125, "completions/min_length": 398.0, "epoch": 11.87205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010113312862813473, "kl": 0.007530152564868331, "learning_rate": 4.2419199577654586e-07, "loss": 7.474442099919543e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 424.6875, "completions/min_length": 318.0, "epoch": 11.873529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.016083095222711563, "kl": 0.009209598298184574, "learning_rate": 4.2406514883959577e-07, "loss": 9.228195995092392e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 440.25, "completions/min_length": 362.0, "epoch": 11.875, "frac_reward_zero_std": 0.5, "grad_norm": 1.1436712741851807, "kl": 0.008990533067844808, "learning_rate": 4.2393830690504165e-07, "loss": 8.922815322875977e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 399.8125, "completions/min_length": 365.0, "epoch": 11.876470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.018578162416815758, "kl": 0.008119347738102078, "learning_rate": 4.238114699812393e-07, "loss": 8.120114216580987e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 429.125, "completions/min_length": 389.0, "epoch": 11.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.046982571482658386, "kl": 0.011469765566289425, "learning_rate": 4.2368463807654444e-07, "loss": 0.00011503860878292471, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 466.625, "completions/min_length": 366.0, "epoch": 11.879411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5320976972579956, "kl": 0.010529296938329935, "learning_rate": 4.235578111993125e-07, "loss": 0.00010569766163825989, "reward": 0.6470000147819519, "reward_std": 0.3146944046020508, "rewards/DrugCombAccuracyCOTORM/mean": 0.5900000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4849192500114441, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5374838709831238, "step": 8078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 471.8125, "completions/min_length": 424.0, "epoch": 11.880882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.731113851070404, "kl": 0.01051918976008892, "learning_rate": 4.2343098935789854e-07, "loss": 0.00010630488395690918, "reward": 0.45891666412353516, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 1.0144785642623901, "step": 8079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/mean_length": 564.0, "completions/min_length": 466.0, "epoch": 11.882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.1263573169708252, "kl": 0.010180425364524126, "learning_rate": 4.2330417256065723e-07, "loss": 0.00010056793689727783, "reward": 0.4974730610847473, "reward_std": 0.313975989818573, "rewards/DrugCombAccuracyCOTORM/mean": 0.4726225733757019, "rewards/DrugCombAccuracyCOTORM/std": 0.44450974464416504, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1937500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.9615742564201355, "step": 8080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 492.375, "completions/min_length": 429.0, "epoch": 11.883823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.0896270275115967, "kl": 0.012692534131929278, "learning_rate": 4.2317736081594304e-07, "loss": 0.00012733787298202515, "reward": 0.7540624737739563, "reward_std": 0.2548336684703827, "rewards/DrugCombAccuracyCOTORM/mean": 0.7023437023162842, "rewards/DrugCombAccuracyCOTORM/std": 0.3571828305721283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.921875, "rewards/DrugCombCoverageCOTORM/std": 0.11967839300632477, "step": 8081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 480.3125, "completions/min_length": 434.0, "epoch": 11.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.7629762887954712, "kl": 0.010315247112885118, "learning_rate": 4.230505541321098e-07, "loss": 0.00010297990957042202, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 443.0625, "completions/min_length": 385.0, "epoch": 11.886764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.029493123292922974, "kl": 0.008110399707220495, "learning_rate": 4.229237525175113e-07, "loss": 8.202534809242934e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 510.4375, "completions/min_length": 405.0, "epoch": 11.888235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9456104040145874, "kl": 0.014060518704354763, "learning_rate": 4.227969559805009e-07, "loss": 0.0001412034034729004, "reward": 0.9322500228881836, "reward_std": 0.11678983271121979, "rewards/DrugCombAccuracyCOTORM/mean": 0.9179166555404663, "rewards/DrugCombAccuracyCOTORM/std": 0.20843996107578278, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 432.3125, "completions/min_length": 357.0, "epoch": 11.889705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.014374511316418648, "kl": 0.009596953983418643, "learning_rate": 4.226701645294317e-07, "loss": 9.498151484876871e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 487.4375, "completions/min_length": 395.0, "epoch": 11.891176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.001634120941162, "kl": 0.010164026403799653, "learning_rate": 4.225433781726564e-07, "loss": 0.00010146562271984294, "reward": 0.6400905251502991, "reward_std": 0.15212944149971008, "rewards/DrugCombAccuracyCOTORM/mean": 0.5930818915367126, "rewards/DrugCombAccuracyCOTORM/std": 0.47971466183662415, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.4366062581539154, "step": 8086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 417.8125, "completions/min_length": 377.0, "epoch": 11.89264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1949012279510498, "kl": 0.009430766105651855, "learning_rate": 4.224165969185274e-07, "loss": 9.436160326004028e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 488.875, "completions/min_length": 363.0, "epoch": 11.894117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.1041042804718018, "kl": 0.011996502056717873, "learning_rate": 4.2228982077539655e-07, "loss": 0.00012181016791146249, "reward": 0.7171875238418579, "reward_std": 0.23422911763191223, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 410.875, "completions/min_length": 359.0, "epoch": 11.895588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013978641480207443, "kl": 0.006502013187855482, "learning_rate": 4.2216304975161565e-07, "loss": 6.508975639007986e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 440.75, "completions/min_length": 311.0, "epoch": 11.897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8200721144676208, "kl": 0.01174793520476669, "learning_rate": 4.2203628385553605e-07, "loss": 0.00011840670049423352, "reward": 0.606669545173645, "reward_std": 0.052802328020334244, "rewards/DrugCombAccuracyCOTORM/mean": 0.5489619374275208, "rewards/DrugCombAccuracyCOTORM/std": 0.4692845344543457, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.675000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.5026596188545227, "step": 8090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 487.3125, "completions/min_length": 360.0, "epoch": 11.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7519215941429138, "kl": 0.0071986744878813624, "learning_rate": 4.2190952309550875e-07, "loss": 7.20451062079519e-05, "reward": 0.9052083492279053, "reward_std": 0.10225021839141846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 454.875, "completions/min_length": 362.0, "epoch": 11.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.013615141622722149, "kl": 0.008691065129823983, "learning_rate": 4.2178276747988444e-07, "loss": 8.71472802828066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 460.0, "completions/min_length": 425.0, "epoch": 11.901470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9593790173530579, "kl": 0.011996695306152105, "learning_rate": 4.2165601701701365e-07, "loss": 0.00011873617768287659, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 520.3125, "completions/min_length": 476.0, "epoch": 11.902941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2360714673995972, "kl": 0.012067266507074237, "learning_rate": 4.21529271715246e-07, "loss": 0.00012046098709106445, "reward": 0.875, "reward_std": 0.2449311763048172, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3010398745536804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 503.6875, "completions/min_length": 439.0, "epoch": 11.904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8447682857513428, "kl": 0.00807344913482666, "learning_rate": 4.214025315829314e-07, "loss": 8.059797255555168e-05, "reward": 0.6919536590576172, "reward_std": 0.17272412776947021, "rewards/DrugCombAccuracyCOTORM/mean": 0.6583448648452759, "rewards/DrugCombAccuracyCOTORM/std": 0.435886412858963, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6527777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.6835817098617554, "step": 8095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 431.25, "completions/min_length": 387.0, "epoch": 11.905882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.013962402008473873, "kl": 0.009064890095032752, "learning_rate": 4.2127579662841906e-07, "loss": 9.063517791219056e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/mean_length": 545.375, "completions/min_length": 445.0, "epoch": 11.907352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8384634256362915, "kl": 0.0076283507514745, "learning_rate": 4.2114906686005807e-07, "loss": 7.726997137069702e-05, "reward": 0.7661246061325073, "reward_std": 0.111514613032341, "rewards/DrugCombAccuracyCOTORM/mean": 0.7180724143981934, "rewards/DrugCombAccuracyCOTORM/std": 0.3394508957862854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17083671689033508, "step": 8097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 408.5625, "completions/min_length": 368.0, "epoch": 11.908823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.00983304250985384, "kl": 0.006871881312690675, "learning_rate": 4.21022342286197e-07, "loss": 6.908434443175793e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 454.25, "completions/min_length": 414.0, "epoch": 11.910294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.024294789880514145, "kl": 0.013049207627773285, "learning_rate": 4.2089562291518423e-07, "loss": 0.00013091991422697902, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 433.1875, "completions/min_length": 372.0, "epoch": 11.911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012969334609806538, "kl": 0.010069468524307013, "learning_rate": 4.207689087553675e-07, "loss": 0.00010140203812625259, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 474.75, "completions/min_length": 411.0, "epoch": 11.913235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.8374385833740234, "kl": 0.007600351935252547, "learning_rate": 4.2064219981509464e-07, "loss": 7.580220699310303e-05, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 8101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 465.875, "completions/min_length": 386.0, "epoch": 11.91470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8591660857200623, "kl": 0.014193382812663913, "learning_rate": 4.2051549610271273e-07, "loss": 0.00014058285159990191, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 475.0, "completions/min_length": 421.0, "epoch": 11.916176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9719282984733582, "kl": 0.011483082780614495, "learning_rate": 4.2038879762656885e-07, "loss": 0.00011418537178542465, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 424.375, "completions/min_length": 379.0, "epoch": 11.91764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9572464227676392, "kl": 0.0074248951859772205, "learning_rate": 4.202621043950095e-07, "loss": 7.376846042461693e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 451.375, "completions/min_length": 383.0, "epoch": 11.919117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7961628437042236, "kl": 0.009588470216840506, "learning_rate": 4.2013541641638106e-07, "loss": 9.620189666748047e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 456.0, "completions/min_length": 388.0, "epoch": 11.920588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015509411692619324, "kl": 0.008474167669191957, "learning_rate": 4.200087336990291e-07, "loss": 8.546598837710917e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 474.875, "completions/min_length": 414.0, "epoch": 11.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.03559805080294609, "kl": 0.01312381459865719, "learning_rate": 4.1988205625129934e-07, "loss": 0.00012737710494548082, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 470.375, "completions/min_length": 406.0, "epoch": 11.923529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.06968356668949127, "kl": 0.01881116651929915, "learning_rate": 4.19755384081537e-07, "loss": 0.00018771989562083036, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 501.125, "completions/min_length": 413.0, "epoch": 11.925, "frac_reward_zero_std": 0.5, "grad_norm": 0.9502888321876526, "kl": 0.010104272281751037, "learning_rate": 4.196287171980869e-07, "loss": 0.00010041892528533936, "reward": 0.8376883864402771, "reward_std": 0.13978147506713867, "rewards/DrugCombAccuracyCOTORM/mean": 0.8114333748817444, "rewards/DrugCombAccuracyCOTORM/std": 0.292868971824646, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.22541113197803497, "step": 8109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 477.125, "completions/min_length": 425.0, "epoch": 11.926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9975438714027405, "kl": 0.013130840728990734, "learning_rate": 4.1950205560929346e-07, "loss": 0.00013061687059234828, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 430.9375, "completions/min_length": 385.0, "epoch": 11.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.11893999576568604, "kl": 0.010214906302280724, "learning_rate": 4.19375399323501e-07, "loss": 0.00010310981451766565, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 425.75, "completions/min_length": 385.0, "epoch": 11.929411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012942464090883732, "kl": 0.008481195196509361, "learning_rate": 4.1924874834905313e-07, "loss": 8.481161785311997e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 443.625, "completions/min_length": 408.0, "epoch": 11.930882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011145476251840591, "kl": 0.007358202361501753, "learning_rate": 4.1912210269429333e-07, "loss": 7.351795647991821e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 436.5625, "completions/min_length": 396.0, "epoch": 11.93235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.044594168663025, "kl": 0.007903640274889767, "learning_rate": 4.1899546236756474e-07, "loss": 7.88569450378418e-05, "reward": 0.7352499961853027, "reward_std": 0.2242516130208969, "rewards/DrugCombAccuracyCOTORM/mean": 0.7081249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.4495474696159363, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.57373046875, "step": 8114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 453.875, "completions/min_length": 390.0, "epoch": 11.933823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.870710015296936, "kl": 0.00937342969700694, "learning_rate": 4.1886882737721005e-07, "loss": 9.29512971197255e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 461.6875, "completions/min_length": 400.0, "epoch": 11.935294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9978739619255066, "kl": 0.012528957100585103, "learning_rate": 4.1874219773157175e-07, "loss": 0.0001256316900253296, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 391.25, "completions/min_length": 340.0, "epoch": 11.936764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.007362066302448511, "kl": 0.006470749969594181, "learning_rate": 4.186155734389919e-07, "loss": 6.478492286987603e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 470.8125, "completions/min_length": 435.0, "epoch": 11.938235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.061238329857587814, "kl": 0.0121792983263731, "learning_rate": 4.1848895450781203e-07, "loss": 0.00012174980656709522, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 459.375, "completions/min_length": 384.0, "epoch": 11.939705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8930584192276001, "kl": 0.012172401417046785, "learning_rate": 4.1836234094637357e-07, "loss": 0.00012209266424179077, "reward": 0.9078333377838135, "reward_std": 0.14855782687664032, "rewards/DrugCombAccuracyCOTORM/mean": 0.8899999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.266232967376709, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 8119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 444.375, "completions/min_length": 412.0, "epoch": 11.941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1463863849639893, "kl": 0.008503120858222246, "learning_rate": 4.1823573276301744e-07, "loss": 8.58604907989502e-05, "reward": 0.46272915601730347, "reward_std": 0.3295838236808777, "rewards/DrugCombAccuracyCOTORM/mean": 0.37203124165534973, "rewards/DrugCombAccuracyCOTORM/std": 0.38408035039901733, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6510416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.46668529510498047, "step": 8120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 408.5, "completions/min_length": 354.0, "epoch": 11.94264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.029641738161444664, "kl": 0.00935452920384705, "learning_rate": 4.181091299660844e-07, "loss": 9.408695041202009e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 453.625, "completions/min_length": 365.0, "epoch": 11.944117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8265768885612488, "kl": 0.007764876005239785, "learning_rate": 4.1798253256391453e-07, "loss": 7.803027256159112e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 467.4375, "completions/min_length": 423.0, "epoch": 11.945588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.1628386974334717, "kl": 0.011958949849940836, "learning_rate": 4.17855940564848e-07, "loss": 0.00012072175741195679, "reward": 0.675000011920929, "reward_std": 0.4256991147994995, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 434.875, "completions/min_length": 374.0, "epoch": 11.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.934293270111084, "kl": 0.008274638908915222, "learning_rate": 4.1772935397722414e-07, "loss": 8.223811164498329e-05, "reward": 0.8812500238418579, "reward_std": 0.2202879637479782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 8124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 476.0625, "completions/min_length": 444.0, "epoch": 11.948529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01361712347716093, "kl": 0.008999698795378208, "learning_rate": 4.176027728093822e-07, "loss": 8.961286948760971e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 494.1875, "completions/min_length": 437.0, "epoch": 11.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.01614277809858322, "kl": 0.010244367294944823, "learning_rate": 4.1747619706966113e-07, "loss": 0.0001028695551212877, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 434.5, "completions/min_length": 375.0, "epoch": 11.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010352922603487968, "kl": 0.007942340453155339, "learning_rate": 4.173496267663993e-07, "loss": 7.951847510412335e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 437.75, "completions/min_length": 398.0, "epoch": 11.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03147795796394348, "kl": 0.010649744304828346, "learning_rate": 4.1722306190793495e-07, "loss": 0.00010586461576167494, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 460.25, "completions/min_length": 398.0, "epoch": 11.954411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.03752007707953453, "kl": 0.008316683699376881, "learning_rate": 4.1709650250260586e-07, "loss": 8.393984171561897e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 469.5, "completions/min_length": 364.0, "epoch": 11.955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9547036290168762, "kl": 0.010641641216352582, "learning_rate": 4.169699485587493e-07, "loss": 0.00010563028627075255, "reward": 0.75, "reward_std": 0.17728105187416077, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 420.25, "completions/min_length": 380.0, "epoch": 11.95735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1196372509002686, "kl": 0.010818460141308606, "learning_rate": 4.1684340008470236e-07, "loss": 0.00010752677917480469, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 454.125, "completions/min_length": 389.0, "epoch": 11.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0864093080163002, "kl": 0.012494482100009918, "learning_rate": 4.1671685708880184e-07, "loss": 0.00012369542673695832, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/mean_length": 486.625, "completions/min_length": 383.0, "epoch": 11.96029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1029067039489746, "kl": 0.01206298591569066, "learning_rate": 4.16590319579384e-07, "loss": 0.0001212132119690068, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 414.6875, "completions/min_length": 355.0, "epoch": 11.961764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.008456473238766193, "kl": 0.007165206712670624, "learning_rate": 4.1646378756478477e-07, "loss": 7.090918370522559e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 455.8125, "completions/min_length": 402.0, "epoch": 11.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1397154331207275, "kl": 0.008209695108234882, "learning_rate": 4.1633726105334e-07, "loss": 8.214210538426414e-05, "reward": 0.8500000238418579, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 464.75, "completions/min_length": 351.0, "epoch": 11.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7560334801673889, "kl": 0.010074872057884932, "learning_rate": 4.162107400533846e-07, "loss": 9.96484377537854e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 443.1875, "completions/min_length": 354.0, "epoch": 11.966176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.3284854888916016, "kl": 0.011891782516613603, "learning_rate": 4.1608422457325356e-07, "loss": 0.00012033246457576752, "reward": 0.7416666746139526, "reward_std": 0.20266088843345642, "rewards/DrugCombAccuracyCOTORM/mean": 0.7083333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.4367387592792511, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 431.0, "completions/min_length": 355.0, "epoch": 11.967647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013499833643436432, "kl": 0.008422416518442333, "learning_rate": 4.159577146212815e-07, "loss": 8.475784852635115e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 450.9375, "completions/min_length": 366.0, "epoch": 11.969117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.12685920298099518, "kl": 0.0128577989526093, "learning_rate": 4.1583121020580246e-07, "loss": 0.00012832082575187087, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.6875, "completions/min_length": 349.0, "epoch": 11.970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010232693515717983, "kl": 0.00823727494571358, "learning_rate": 4.1570471133515033e-07, "loss": 8.201124001061544e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 455.8125, "completions/min_length": 416.0, "epoch": 11.972058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014110014773905277, "kl": 0.009666952537372708, "learning_rate": 4.1557821801765864e-07, "loss": 9.632945875637233e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 449.875, "completions/min_length": 412.0, "epoch": 11.973529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008445358835160732, "kl": 0.0075471056625247, "learning_rate": 4.1545173026166013e-07, "loss": 7.560949597973377e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 502.6875, "completions/min_length": 413.0, "epoch": 11.975, "frac_reward_zero_std": 1.0, "grad_norm": 0.020321277901530266, "kl": 0.009239055099897087, "learning_rate": 4.153252480754877e-07, "loss": 9.407525067217648e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 430.9375, "completions/min_length": 385.0, "epoch": 11.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6579510569572449, "kl": 0.007833644282072783, "learning_rate": 4.151987714674736e-07, "loss": 7.970289152581245e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 460.125, "completions/min_length": 426.0, "epoch": 11.977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01533882599323988, "kl": 0.00831830536480993, "learning_rate": 4.150723004459499e-07, "loss": 8.314871956827119e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 461.375, "completions/min_length": 416.0, "epoch": 11.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9699733257293701, "kl": 0.009410367580130696, "learning_rate": 4.14945835019248e-07, "loss": 9.431689977645874e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 463.3125, "completions/min_length": 404.0, "epoch": 11.980882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.758417546749115, "kl": 0.00698689476121217, "learning_rate": 4.148193751956994e-07, "loss": 6.971784023335204e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 441.875, "completions/min_length": 377.0, "epoch": 11.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.7773017883300781, "kl": 0.01834361965302378, "learning_rate": 4.1469292098363477e-07, "loss": 0.00016850251995492727, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 463.6875, "completions/min_length": 417.0, "epoch": 11.983823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9847879409790039, "kl": 0.010203278856351972, "learning_rate": 4.1456647239138454e-07, "loss": 0.00010260939598083496, "reward": 0.30000001192092896, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 450.8125, "completions/min_length": 379.0, "epoch": 11.985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.012065446935594082, "kl": 0.010683373780921102, "learning_rate": 4.1444002942727894e-07, "loss": 0.00010712218499975279, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 8150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 454.625, "completions/min_length": 370.0, "epoch": 11.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.650146484375, "kl": 0.010578446555882692, "learning_rate": 4.1431359209964767e-07, "loss": 0.00010764414037112147, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 505.375, "completions/min_length": 454.0, "epoch": 11.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9551379084587097, "kl": 0.010098573518916965, "learning_rate": 4.141871604168201e-07, "loss": 0.0001012349093798548, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 441.125, "completions/min_length": 383.0, "epoch": 11.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.6962639093399048, "kl": 0.0076832856284454465, "learning_rate": 4.1406073438712535e-07, "loss": 7.689744234085083e-05, "reward": 0.6603333353996277, "reward_std": 0.03111269697546959, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 431.125, "completions/min_length": 393.0, "epoch": 11.991176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.00957835465669632, "kl": 0.0075488927541300654, "learning_rate": 4.1393431401889183e-07, "loss": 7.566924614366144e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 474.3125, "completions/min_length": 382.0, "epoch": 11.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9084432125091553, "kl": 0.011226360686123371, "learning_rate": 4.138078993204479e-07, "loss": 0.00011291354894638062, "reward": 0.6625000238418579, "reward_std": 0.2133909910917282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 428.625, "completions/min_length": 374.0, "epoch": 11.994117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.015264968387782574, "kl": 0.009213027195073664, "learning_rate": 4.1368149030012144e-07, "loss": 9.220586798619479e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 447.625, "completions/min_length": 384.0, "epoch": 11.995588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009658358059823513, "kl": 0.00913196918554604, "learning_rate": 4.1355508696623994e-07, "loss": 9.181065979646519e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 469.5, "completions/min_length": 361.0, "epoch": 11.99705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.013729842379689217, "kl": 0.00966998387593776, "learning_rate": 4.1342868932713056e-07, "loss": 9.724783012643456e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 463.0, "completions/min_length": 401.0, "epoch": 11.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.016884690150618553, "kl": 0.010368075803853571, "learning_rate": 4.133022973911201e-07, "loss": 0.00010349808871978894, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 486.4375, "completions/min_length": 414.0, "epoch": 12.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.8425585031509399, "kl": 0.0069122189888730645, "learning_rate": 4.131759111665348e-07, "loss": 6.9115158112254e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 468.4375, "completions/min_length": 441.0, "epoch": 12.001470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0041837692260742, "kl": 0.01366165722720325, "learning_rate": 4.1304953066170076e-07, "loss": 0.00013498961925506592, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 449.0625, "completions/min_length": 355.0, "epoch": 12.00294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0397696495056152, "kl": 0.01282852073200047, "learning_rate": 4.1292315588494363e-07, "loss": 0.0001282159792026505, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 438.625, "completions/min_length": 407.0, "epoch": 12.004411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.05417153984308243, "kl": 0.008889154181815684, "learning_rate": 4.1279678684458855e-07, "loss": 8.940044062910601e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 387.75, "completions/min_length": 355.0, "epoch": 12.005882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.03193346783518791, "kl": 0.010914316168054938, "learning_rate": 4.126704235489605e-07, "loss": 0.0001096587220672518, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 530.125, "completions/min_length": 435.0, "epoch": 12.007352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0237772464752197, "kl": 0.00970125908497721, "learning_rate": 4.1254406600638413e-07, "loss": 9.684167162049562e-05, "reward": 0.9010000228881836, "reward_std": 0.08378955721855164, "rewards/DrugCombAccuracyCOTORM/mean": 0.8801562786102295, "rewards/DrugCombAccuracyCOTORM/std": 0.18698236346244812, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 8165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 461.3125, "completions/min_length": 383.0, "epoch": 12.008823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.414272427558899, "kl": 0.014568173792213202, "learning_rate": 4.124177142251832e-07, "loss": 0.00014799833297729492, "reward": 0.71875, "reward_std": 0.44291412830352783, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6020797491073608, "step": 8166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 455.8125, "completions/min_length": 377.0, "epoch": 12.010294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.5184556245803833, "kl": 0.012397081591188908, "learning_rate": 4.122913682136816e-07, "loss": 0.00012260675430297852, "reward": 0.44218748807907104, "reward_std": 0.22911673784255981, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 436.75, "completions/min_length": 343.0, "epoch": 12.011764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015165694057941437, "kl": 0.009156559477560222, "learning_rate": 4.121650279802027e-07, "loss": 9.288496221415699e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 441.6875, "completions/min_length": 361.0, "epoch": 12.013235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014141091145575047, "kl": 0.011182617163285613, "learning_rate": 4.120386935330695e-07, "loss": 0.00011206306226085871, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 452.875, "completions/min_length": 408.0, "epoch": 12.014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9684935212135315, "kl": 0.015489809680730104, "learning_rate": 4.1191236488060456e-07, "loss": 0.00015579164028167725, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 438.875, "completions/min_length": 348.0, "epoch": 12.016176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.820249080657959, "kl": 0.011746963020414114, "learning_rate": 4.1178604203113015e-07, "loss": 0.00011835699115181342, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 502.8125, "completions/min_length": 432.0, "epoch": 12.01764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0455403327941895, "kl": 0.007948426529765129, "learning_rate": 4.1165972499296796e-07, "loss": 7.894635200500488e-05, "reward": 0.9869999885559082, "reward_std": 0.03676954284310341, "rewards/DrugCombAccuracyCOTORM/mean": 0.9837499856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.06499999761581421, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 510.375, "completions/min_length": 429.0, "epoch": 12.019117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.559271216392517, "kl": 0.01269003702327609, "learning_rate": 4.115334137744396e-07, "loss": 0.00012754276394844055, "reward": 0.7410833239555359, "reward_std": 0.37961670756340027, "rewards/DrugCombAccuracyCOTORM/mean": 0.7020238041877747, "rewards/DrugCombAccuracyCOTORM/std": 0.42156246304512024, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7946428656578064, "rewards/DrugCombCoverageCOTORM/std": 0.3536736071109772, "step": 8173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 496.1875, "completions/min_length": 390.0, "epoch": 12.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7387986779212952, "kl": 0.008459553122520447, "learning_rate": 4.11407108383866e-07, "loss": 8.555435488233343e-05, "reward": 0.8250000476837158, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 447.8125, "completions/min_length": 380.0, "epoch": 12.022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.020209144800901413, "kl": 0.009451292688027024, "learning_rate": 4.1128080882956796e-07, "loss": 9.361532283946872e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 489.4375, "completions/min_length": 394.0, "epoch": 12.023529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7746096849441528, "kl": 0.005682847986463457, "learning_rate": 4.1115451511986564e-07, "loss": 5.650569073623046e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 444.0625, "completions/min_length": 399.0, "epoch": 12.025, "frac_reward_zero_std": 1.0, "grad_norm": 0.01156681589782238, "kl": 0.009799827937968075, "learning_rate": 4.110282272630792e-07, "loss": 9.835899982135743e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 443.0, "completions/min_length": 412.0, "epoch": 12.026470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.210735559463501, "kl": 0.013640881516039371, "learning_rate": 4.1090194526752775e-07, "loss": 0.00013683849829249084, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 471.625, "completions/min_length": 410.0, "epoch": 12.027941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7827478647232056, "kl": 0.01085804239846766, "learning_rate": 4.107756691415308e-07, "loss": 0.00010922551155090332, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 532.5, "completions/min_length": 407.0, "epoch": 12.029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7116611003875732, "kl": 0.009175415500067174, "learning_rate": 4.1064939889340686e-07, "loss": 9.139104804489762e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 453.9375, "completions/min_length": 374.0, "epoch": 12.030882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6804343461990356, "kl": 0.009642138262279332, "learning_rate": 4.105231345314745e-07, "loss": 9.672343730926514e-05, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 454.0625, "completions/min_length": 393.0, "epoch": 12.032352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02568758837878704, "kl": 0.009719027439132333, "learning_rate": 4.103968760640516e-07, "loss": 9.613750444259495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 444.5, "completions/min_length": 358.0, "epoch": 12.033823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008695962838828564, "kl": 0.006323143607005477, "learning_rate": 4.102706234994558e-07, "loss": 6.285943527473137e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 407.875, "completions/min_length": 333.0, "epoch": 12.035294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.011589832603931427, "kl": 0.008538009598851204, "learning_rate": 4.1014437684600416e-07, "loss": 8.649492519907653e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 493.75, "completions/min_length": 414.0, "epoch": 12.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8903957605361938, "kl": 0.010303572984412313, "learning_rate": 4.1001813611201357e-07, "loss": 0.00010234427463728935, "reward": 0.606249988079071, "reward_std": 0.1635269820690155, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 8185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 483.5, "completions/min_length": 398.0, "epoch": 12.038235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.7884311079978943, "kl": 0.009516485151834786, "learning_rate": 4.098919013058005e-07, "loss": 9.553243580739945e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 404.4375, "completions/min_length": 368.0, "epoch": 12.03970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014346420764923096, "kl": 0.008458600845187902, "learning_rate": 4.097656724356809e-07, "loss": 8.430280286120251e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 444.125, "completions/min_length": 370.0, "epoch": 12.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2192165851593018, "kl": 0.011312209302559495, "learning_rate": 4.0963944950997045e-07, "loss": 0.00011286756489425898, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 399.875, "completions/min_length": 357.0, "epoch": 12.04264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.008152980357408524, "kl": 0.0069797622272744775, "learning_rate": 4.0951323253698447e-07, "loss": 6.978643068578094e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 440.0, "completions/min_length": 360.0, "epoch": 12.044117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.3189270496368408, "kl": 0.012545809266157448, "learning_rate": 4.093870215250377e-07, "loss": 0.0001286529004573822, "reward": 0.581250011920929, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 426.1875, "completions/min_length": 369.0, "epoch": 12.045588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014174310490489006, "kl": 0.008870325167663395, "learning_rate": 4.0926081648244455e-07, "loss": 8.953988435678184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 544.875, "completions/min_length": 509.0, "epoch": 12.047058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4142073392868042, "kl": 0.009477270883508027, "learning_rate": 4.091346174175192e-07, "loss": 9.454786777496338e-05, "reward": 0.7897031903266907, "reward_std": 0.14254407584667206, "rewards/DrugCombAccuracyCOTORM/mean": 0.7666428685188293, "rewards/DrugCombAccuracyCOTORM/std": 0.28136327862739563, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7638888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.26566168665885925, "step": 8192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 476.5625, "completions/min_length": 404.0, "epoch": 12.048529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.5758382081985474, "kl": 0.011008851462975144, "learning_rate": 4.0900842433857536e-07, "loss": 0.00011005997657775879, "reward": 0.8999999761581421, "reward_std": 0.2104278802871704, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.2687419056892395, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 482.25, "completions/min_length": 432.0, "epoch": 12.05, "frac_reward_zero_std": 0.5, "grad_norm": 1.0975725650787354, "kl": 0.0131102807354182, "learning_rate": 4.0888223725392624e-07, "loss": 0.00013277643301989883, "reward": 0.987500011920929, "reward_std": 0.0353553406894207, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 446.0625, "completions/min_length": 365.0, "epoch": 12.051470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.011623714119195938, "kl": 0.006241157767362893, "learning_rate": 4.087560561718848e-07, "loss": 6.252606544876471e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 486.9375, "completions/min_length": 377.0, "epoch": 12.052941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.4868669509887695, "kl": 0.014697021339088678, "learning_rate": 4.0862988110076343e-07, "loss": 0.00014664232730865479, "reward": 0.8692708611488342, "reward_std": 0.2583695650100708, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 453.0, "completions/min_length": 398.0, "epoch": 12.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.05687204375863075, "kl": 0.012130810995586216, "learning_rate": 4.085037120488742e-07, "loss": 0.0001180415929411538, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 422.875, "completions/min_length": 372.0, "epoch": 12.055882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011161549016833305, "kl": 0.006120829842984676, "learning_rate": 4.0837754902452887e-07, "loss": 6.0887345171067864e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 446.9375, "completions/min_length": 380.0, "epoch": 12.05735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01874171756207943, "kl": 0.01053468557074666, "learning_rate": 4.0825139203603875e-07, "loss": 0.00010550255683483556, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 468.6875, "completions/min_length": 424.0, "epoch": 12.058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9901131987571716, "kl": 0.01132359413895756, "learning_rate": 4.0812524109171475e-07, "loss": 0.0001139320811489597, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 499.8125, "completions/min_length": 440.0, "epoch": 12.060294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.11020028591156, "kl": 0.01490444946102798, "learning_rate": 4.079990961998675e-07, "loss": 0.00014846860722173005, "reward": 0.6322083473205566, "reward_std": 0.15127438306808472, "rewards/DrugCombAccuracyCOTORM/mean": 0.5728124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.5018232464790344, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3546973168849945, "step": 8201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 461.0625, "completions/min_length": 418.0, "epoch": 12.061764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01373886689543724, "kl": 0.008426801417954266, "learning_rate": 4.078729573688068e-07, "loss": 8.39091298985295e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 469.5, "completions/min_length": 397.0, "epoch": 12.063235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1496456861495972, "kl": 0.01034809008706361, "learning_rate": 4.077468246068426e-07, "loss": 0.00010336657578591257, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 464.1875, "completions/min_length": 389.0, "epoch": 12.064705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9536718726158142, "kl": 0.0077716868836432695, "learning_rate": 4.076206979222841e-07, "loss": 7.728487253189087e-05, "reward": 0.10625000298023224, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 8204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 430.0625, "completions/min_length": 370.0, "epoch": 12.066176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.6473550200462341, "kl": 0.007300498080439866, "learning_rate": 4.0749457732344015e-07, "loss": 7.319450378417969e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 469.5625, "completions/min_length": 367.0, "epoch": 12.06764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 39.942447662353516, "kl": 0.020986970281228423, "learning_rate": 4.073684628186195e-07, "loss": 0.00020167281036265194, "reward": 0.872372567653656, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8592156767845154, "rewards/DrugCombAccuracyCOTORM/std": 0.14540143311023712, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 8206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 453.8125, "completions/min_length": 389.0, "epoch": 12.069117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.010355900973081589, "kl": 0.006097986130043864, "learning_rate": 4.072423544161301e-07, "loss": 6.143932841951028e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 440.375, "completions/min_length": 390.0, "epoch": 12.070588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.7936111688613892, "kl": 0.010728549212217331, "learning_rate": 4.071162521242796e-07, "loss": 0.00010685622692108154, "reward": 0.550000011920929, "reward_std": 0.3265853822231293, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 396.4375, "completions/min_length": 332.0, "epoch": 12.072058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.5872424244880676, "kl": 0.020489668124355376, "learning_rate": 4.0699015595137534e-07, "loss": 0.00020051647152286023, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 447.0, "completions/min_length": 405.0, "epoch": 12.073529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.6491802930831909, "kl": 0.009442223235964775, "learning_rate": 4.0686406590572413e-07, "loss": 9.430944919586182e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 467.5625, "completions/min_length": 404.0, "epoch": 12.075, "frac_reward_zero_std": 0.5, "grad_norm": 0.8973551392555237, "kl": 0.010388591326773167, "learning_rate": 4.0673798199563263e-07, "loss": 0.00010400575411040336, "reward": 0.9706666469573975, "reward_std": 0.054983966052532196, "rewards/DrugCombAccuracyCOTORM/mean": 0.9633333683013916, "rewards/DrugCombAccuracyCOTORM/std": 0.10125143080949783, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 471.25, "completions/min_length": 416.0, "epoch": 12.076470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.116562843322754, "kl": 0.009795154444873333, "learning_rate": 4.066119042294069e-07, "loss": 9.814649820327759e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 518.4375, "completions/min_length": 424.0, "epoch": 12.077941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.5342657566070557, "kl": 0.012667891569435596, "learning_rate": 4.064858326153526e-07, "loss": 0.0001284666359424591, "reward": 0.6723666787147522, "reward_std": 0.25224798917770386, "rewards/DrugCombAccuracyCOTORM/mean": 0.6034791469573975, "rewards/DrugCombAccuracyCOTORM/std": 0.4322218596935272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 8213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 473.9375, "completions/min_length": 427.0, "epoch": 12.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.865453839302063, "kl": 0.008145926869474351, "learning_rate": 4.06359767161775e-07, "loss": 8.141249418258667e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 470.8125, "completions/min_length": 376.0, "epoch": 12.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9730697870254517, "kl": 0.008804201381281018, "learning_rate": 4.0623370787697885e-07, "loss": 8.884543785825372e-05, "reward": 0.7622886896133423, "reward_std": 0.18636730313301086, "rewards/DrugCombAccuracyCOTORM/mean": 0.7087202072143555, "rewards/DrugCombAccuracyCOTORM/std": 0.43724554777145386, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10527191311120987, "step": 8215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 513.1875, "completions/min_length": 385.0, "epoch": 12.08235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 7.938459873199463, "kl": 0.03511033486574888, "learning_rate": 4.0610765476926885e-07, "loss": 0.00035778991878032684, "reward": 0.7363451719284058, "reward_std": 0.12207003682851791, "rewards/DrugCombAccuracyCOTORM/mean": 0.6841033697128296, "rewards/DrugCombAccuracyCOTORM/std": 0.37208130955696106, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2568235397338867, "step": 8216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 449.5, "completions/min_length": 398.0, "epoch": 12.083823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.847301721572876, "kl": 0.00856002198997885, "learning_rate": 4.0598160784694887e-07, "loss": 8.549315680284053e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 514.5, "completions/min_length": 435.0, "epoch": 12.08529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.898740828037262, "kl": 0.008811351843178272, "learning_rate": 4.0585556711832264e-07, "loss": 8.705093205207959e-05, "reward": 0.3490833640098572, "reward_std": 0.07884903252124786, "rewards/DrugCombAccuracyCOTORM/mean": 0.2775000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.31951701641082764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2708333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.28463754057884216, "step": 8218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 409.375, "completions/min_length": 370.0, "epoch": 12.086764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.01891549862921238, "kl": 0.008794018300250173, "learning_rate": 4.057295325916935e-07, "loss": 8.73599637998268e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 466.0, "completions/min_length": 397.0, "epoch": 12.088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9440261125564575, "kl": 0.008109368151053786, "learning_rate": 4.056035042753641e-07, "loss": 8.054077625274658e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 472.4375, "completions/min_length": 414.0, "epoch": 12.089705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.3297864198684692, "kl": 0.010889009106904268, "learning_rate": 4.054774821776369e-07, "loss": 0.0001103344839066267, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 436.125, "completions/min_length": 390.0, "epoch": 12.091176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1395189762115479, "kl": 0.014093204634264112, "learning_rate": 4.0535146630681406e-07, "loss": 0.0001407712697982788, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 446.8125, "completions/min_length": 391.0, "epoch": 12.092647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8194465041160583, "kl": 0.012223529862239957, "learning_rate": 4.05225456671197e-07, "loss": 0.0001225508749485016, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 457.4375, "completions/min_length": 394.0, "epoch": 12.094117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9916600584983826, "kl": 0.013987506739795208, "learning_rate": 4.0509945327908705e-07, "loss": 0.00014081684639677405, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 455.875, "completions/min_length": 364.0, "epoch": 12.095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.004632830619812, "kl": 0.010882551199756563, "learning_rate": 4.049734561387851e-07, "loss": 0.00010831556573975831, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 452.25, "completions/min_length": 359.0, "epoch": 12.097058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0169818215072155, "kl": 0.010381639120168984, "learning_rate": 4.048474652585912e-07, "loss": 0.00010178133379667997, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 442.5625, "completions/min_length": 387.0, "epoch": 12.098529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.014652787707746029, "kl": 0.008334734593518078, "learning_rate": 4.0472148064680554e-07, "loss": 8.305658411700279e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 459.4375, "completions/min_length": 405.0, "epoch": 12.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.025035139173269272, "kl": 0.011229382362216711, "learning_rate": 4.0459550231172755e-07, "loss": 0.00011237913713557646, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 444.9375, "completions/min_length": 389.0, "epoch": 12.101470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014432898722589016, "kl": 0.01606003090273589, "learning_rate": 4.0446953026165653e-07, "loss": 0.00015847555187065154, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 429.0, "completions/min_length": 337.0, "epoch": 12.102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9727523922920227, "kl": 0.01039719779510051, "learning_rate": 4.04343564504891e-07, "loss": 0.0001043358352035284, "reward": 0.5839166641235352, "reward_std": 0.0458034947514534, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.49018141627311707, "step": 8230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 477.3125, "completions/min_length": 401.0, "epoch": 12.104411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9291966557502747, "kl": 0.0123781762085855, "learning_rate": 4.042176050497295e-07, "loss": 0.00012287532445043325, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 438.375, "completions/min_length": 377.0, "epoch": 12.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.304320216178894, "kl": 0.013243745430372655, "learning_rate": 4.040916519044697e-07, "loss": 0.0001327068021055311, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 468.0625, "completions/min_length": 429.0, "epoch": 12.10735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009038466960191727, "kl": 0.007882518344558775, "learning_rate": 4.039657050774092e-07, "loss": 7.899133925093338e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 514.375, "completions/min_length": 458.0, "epoch": 12.108823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.16115140914917, "kl": 0.010640871012583375, "learning_rate": 4.0383976457684493e-07, "loss": 0.00010711599315982312, "reward": 0.543749988079071, "reward_std": 0.029124118387699127, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 8234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 444.3125, "completions/min_length": 350.0, "epoch": 12.110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.015088001266121864, "kl": 0.007635708665475249, "learning_rate": 4.0371383041107364e-07, "loss": 7.730630750302225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 468.4375, "completions/min_length": 351.0, "epoch": 12.111764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 2.8771650791168213, "kl": 0.01044262747745961, "learning_rate": 4.035879025883916e-07, "loss": 0.00010383129119873047, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 415.8125, "completions/min_length": 367.0, "epoch": 12.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.007409404963254929, "kl": 0.006387082859873772, "learning_rate": 4.0346198111709455e-07, "loss": 6.372854113578796e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 431.1875, "completions/min_length": 389.0, "epoch": 12.114705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0191291905939579, "kl": 0.00874356715939939, "learning_rate": 4.0333606600547785e-07, "loss": 8.774692832957953e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 420.1875, "completions/min_length": 370.0, "epoch": 12.116176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.0062175989151, "kl": 0.011310576694086194, "learning_rate": 4.032101572618365e-07, "loss": 0.00011227341019548476, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 8239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 444.625, "completions/min_length": 337.0, "epoch": 12.117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01973849907517433, "kl": 0.009037607349455357, "learning_rate": 4.0308425489446495e-07, "loss": 9.061531454790384e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 435.5, "completions/min_length": 402.0, "epoch": 12.119117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9756577014923096, "kl": 0.010527858743444085, "learning_rate": 4.029583589116575e-07, "loss": 0.00010505318641662598, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 466.3125, "completions/min_length": 415.0, "epoch": 12.120588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009291858412325382, "kl": 0.006883547292090952, "learning_rate": 4.028324693217078e-07, "loss": 6.860799476271495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 452.1875, "completions/min_length": 376.0, "epoch": 12.12205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7224534153938293, "kl": 0.009798453305847943, "learning_rate": 4.027065861329092e-07, "loss": 9.846655302681029e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 465.375, "completions/min_length": 339.0, "epoch": 12.123529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8605833649635315, "kl": 0.009270319947972894, "learning_rate": 4.025807093535544e-07, "loss": 9.255111217498779e-05, "reward": 0.8166666626930237, "reward_std": 0.18771813809871674, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39849257469177246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 447.625, "completions/min_length": 404.0, "epoch": 12.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.670061469078064, "kl": 0.007678980473428965, "learning_rate": 4.0245483899193586e-07, "loss": 7.65595177654177e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 457.75, "completions/min_length": 406.0, "epoch": 12.126470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.016258958727121353, "kl": 0.009953831322491169, "learning_rate": 4.0232897505634576e-07, "loss": 9.967508958652616e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 459.25, "completions/min_length": 414.0, "epoch": 12.12794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009176268242299557, "kl": 0.006270276033319533, "learning_rate": 4.0220311755507553e-07, "loss": 6.289318844210356e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 448.3125, "completions/min_length": 380.0, "epoch": 12.129411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.011323797516524792, "kl": 0.010211771121248603, "learning_rate": 4.020772664964166e-07, "loss": 0.00010219831892754883, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 433.1875, "completions/min_length": 395.0, "epoch": 12.130882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.00854502897709608, "kl": 0.006064499844796956, "learning_rate": 4.0195142188865936e-07, "loss": 6.0682446928694844e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 442.5625, "completions/min_length": 388.0, "epoch": 12.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012813202105462551, "kl": 0.007283783284947276, "learning_rate": 4.018255837400944e-07, "loss": 7.309066859306768e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 461.75, "completions/min_length": 376.0, "epoch": 12.133823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.6234431266784668, "kl": 0.02068835264071822, "learning_rate": 4.0169975205901147e-07, "loss": 0.00021055340766906738, "reward": 0.637499988079071, "reward_std": 0.4153292179107666, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 371.5625, "completions/min_length": 346.0, "epoch": 12.135294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.00787889864295721, "kl": 0.006220410228706896, "learning_rate": 4.0157392685370017e-07, "loss": 6.180281343404204e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 478.125, "completions/min_length": 388.0, "epoch": 12.136764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.08615412563085556, "kl": 0.012432370567694306, "learning_rate": 4.0144810813244943e-07, "loss": 0.00012395769590511918, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 410.3125, "completions/min_length": 361.0, "epoch": 12.138235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01481232512742281, "kl": 0.008887737058103085, "learning_rate": 4.0132229590354805e-07, "loss": 8.833689207676798e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 466.375, "completions/min_length": 433.0, "epoch": 12.139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.008386578410863876, "kl": 0.007614959613420069, "learning_rate": 4.0119649017528393e-07, "loss": 7.628365710843354e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 478.5, "completions/min_length": 345.0, "epoch": 12.141176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.6475334167480469, "kl": 0.006752469460479915, "learning_rate": 4.0107069095594507e-07, "loss": 6.733532791258767e-05, "reward": 0.9351190328598022, "reward_std": 0.07279587537050247, "rewards/DrugCombAccuracyCOTORM/mean": 0.9241071939468384, "rewards/DrugCombAccuracyCOTORM/std": 0.13162705302238464, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 8256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 506.6875, "completions/min_length": 444.0, "epoch": 12.14264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8495638966560364, "kl": 0.010695458971895278, "learning_rate": 4.0094489825381874e-07, "loss": 0.00010599459346849471, "reward": 0.6079999804496765, "reward_std": 0.03527848422527313, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.47761037945747375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 8257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 453.3125, "completions/min_length": 397.0, "epoch": 12.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01334636565297842, "kl": 0.010384983383119106, "learning_rate": 4.0081911207719177e-07, "loss": 0.00010389171075075865, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 507.0625, "completions/min_length": 436.0, "epoch": 12.145588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.2921754121780396, "kl": 0.01807783404365182, "learning_rate": 4.0069333243435075e-07, "loss": 0.0001787915825843811, "reward": 0.4244166612625122, "reward_std": 0.28916871547698975, "rewards/DrugCombAccuracyCOTORM/mean": 0.30812501907348633, "rewards/DrugCombAccuracyCOTORM/std": 0.3501660227775574, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7791666388511658, "rewards/DrugCombCoverageCOTORM/std": 0.11979920417070389, "step": 8259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.8125, "completions/min_length": 391.0, "epoch": 12.147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.04462505504488945, "kl": 0.008456336217932403, "learning_rate": 4.005675593335818e-07, "loss": 8.528592297807336e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.375, "completions/min_length": 356.0, "epoch": 12.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8621423244476318, "kl": 0.010100894840434194, "learning_rate": 4.0044179278317026e-07, "loss": 0.00010196518269367516, "reward": 0.7791666984558105, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 456.6875, "completions/min_length": 412.0, "epoch": 12.15, "frac_reward_zero_std": 0.5, "grad_norm": 60.43354415893555, "kl": 0.6258760720957071, "learning_rate": 4.0031603279140144e-07, "loss": 0.00580577552318573, "reward": 0.8500000238418579, "reward_std": 0.2070196568965912, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 445.3125, "completions/min_length": 394.0, "epoch": 12.151470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02949543297290802, "kl": 0.011316094431094825, "learning_rate": 4.001902793665601e-07, "loss": 0.0001115787890739739, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 474.625, "completions/min_length": 410.0, "epoch": 12.152941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.4907035827636719, "kl": 0.01271569891832769, "learning_rate": 4.000645325169307e-07, "loss": 0.00012806802988052368, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 453.375, "completions/min_length": 389.0, "epoch": 12.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.021053694188594818, "kl": 0.00808600871823728, "learning_rate": 3.9993879225079685e-07, "loss": 8.131472714012489e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 433.5, "completions/min_length": 405.0, "epoch": 12.155882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1605809926986694, "kl": 0.014489613939076662, "learning_rate": 3.998130585764423e-07, "loss": 0.00014528022438753396, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 464.5625, "completions/min_length": 400.0, "epoch": 12.157352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.020017093047499657, "kl": 0.008242625743150711, "learning_rate": 3.9968733150214983e-07, "loss": 8.287316450150684e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 453.5625, "completions/min_length": 395.0, "epoch": 12.158823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.4233393669128418, "kl": 0.01486281177494675, "learning_rate": 3.9956161103620206e-07, "loss": 0.00014759832993149757, "reward": 0.6153333187103271, "reward_std": 0.05438487231731415, "rewards/DrugCombAccuracyCOTORM/mean": 0.5504166483879089, "rewards/DrugCombAccuracyCOTORM/std": 0.47041648626327515, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 8268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 453.125, "completions/min_length": 360.0, "epoch": 12.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.009325838647782803, "kl": 0.008258973131887615, "learning_rate": 3.994358971868812e-07, "loss": 8.316760795423761e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 462.3125, "completions/min_length": 411.0, "epoch": 12.161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9391369223594666, "kl": 0.012066834606230259, "learning_rate": 3.993101899624689e-07, "loss": 0.00012082832836313173, "reward": 0.625, "reward_std": 0.15811389684677124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 465.0625, "completions/min_length": 410.0, "epoch": 12.163235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.8533374667167664, "kl": 0.008240362629294395, "learning_rate": 3.9918448937124653e-07, "loss": 8.213159162551165e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 536.0, "completions/min_length": 433.0, "epoch": 12.16470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8422584533691406, "kl": 0.008550823200494051, "learning_rate": 3.99058795421495e-07, "loss": 8.573487866669893e-05, "reward": 0.8394444584846497, "reward_std": 0.13342192769050598, "rewards/DrugCombAccuracyCOTORM/mean": 0.8062499761581421, "rewards/DrugCombAccuracyCOTORM/std": 0.2991887331008911, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9444444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 426.5, "completions/min_length": 363.0, "epoch": 12.166176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.00817008689045906, "kl": 0.007405854528769851, "learning_rate": 3.9893310812149437e-07, "loss": 7.409404497593641e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 485.875, "completions/min_length": 417.0, "epoch": 12.16764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.283704400062561, "kl": 0.009485557558946311, "learning_rate": 3.988074274795248e-07, "loss": 9.39778983592987e-05, "reward": 0.4645833671092987, "reward_std": 0.370425283908844, "rewards/DrugCombAccuracyCOTORM/mean": 0.3541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42979326844215393, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 439.75, "completions/min_length": 388.0, "epoch": 12.169117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013822033070027828, "kl": 0.009739230736158788, "learning_rate": 3.986817535038659e-07, "loss": 9.718787623569369e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 474.0, "completions/min_length": 443.0, "epoch": 12.170588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8927451372146606, "kl": 0.009348843828774989, "learning_rate": 3.9855608620279666e-07, "loss": 9.34302806854248e-05, "reward": 0.5076388716697693, "reward_std": 0.021606041118502617, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0763888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.9985328912734985, "step": 8276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 503.25, "completions/min_length": 406.0, "epoch": 12.172058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013317377306520939, "kl": 0.007709192228503525, "learning_rate": 3.9843042558459573e-07, "loss": 7.753431418677792e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 431.375, "completions/min_length": 390.0, "epoch": 12.173529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012496506795287132, "kl": 0.007558987359516323, "learning_rate": 3.9830477165754147e-07, "loss": 7.596869545523077e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 443.1875, "completions/min_length": 364.0, "epoch": 12.175, "frac_reward_zero_std": 0.5, "grad_norm": 1.3812150955200195, "kl": 0.012466548942029476, "learning_rate": 3.981791244299113e-07, "loss": 0.00012480886653065681, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 427.1875, "completions/min_length": 368.0, "epoch": 12.176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.02733525261282921, "kl": 0.01221372198779136, "learning_rate": 3.980534839099828e-07, "loss": 0.00012072362733306363, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 437.6875, "completions/min_length": 386.0, "epoch": 12.177941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.06893685460090637, "kl": 0.010834324406459928, "learning_rate": 3.979278501060328e-07, "loss": 0.00010958919301629066, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 417.9375, "completions/min_length": 387.0, "epoch": 12.179411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01451778318732977, "kl": 0.0097076918464154, "learning_rate": 3.9780222302633764e-07, "loss": 9.703634714242071e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 525.875, "completions/min_length": 391.0, "epoch": 12.180882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.7156001925468445, "kl": 0.008454469847492874, "learning_rate": 3.9767660267917347e-07, "loss": 8.478015661239624e-05, "reward": 0.8073999881744385, "reward_std": 0.18567031621932983, "rewards/DrugCombAccuracyCOTORM/mean": 0.762374997138977, "rewards/DrugCombAccuracyCOTORM/std": 0.4015098214149475, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 8283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 441.875, "completions/min_length": 406.0, "epoch": 12.18235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0572814866900444, "kl": 0.009285696549341083, "learning_rate": 3.9755098907281587e-07, "loss": 9.395240340381861e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/mean_length": 531.3125, "completions/min_length": 469.0, "epoch": 12.183823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9520644545555115, "kl": 0.008840816095471382, "learning_rate": 3.9742538221553967e-07, "loss": 8.649646042613313e-05, "reward": 0.9862916469573975, "reward_std": 0.03877301141619682, "rewards/DrugCombAccuracyCOTORM/mean": 0.98416668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.06333333253860474, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9895833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.041666675359010696, "step": 8285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 451.1875, "completions/min_length": 385.0, "epoch": 12.185294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8477767705917358, "kl": 0.010628810967318714, "learning_rate": 3.972997821156198e-07, "loss": 0.00010819733142852783, "reward": 0.6739000082015991, "reward_std": 0.1386614888906479, "rewards/DrugCombAccuracyCOTORM/mean": 0.5954999923706055, "rewards/DrugCombAccuracyCOTORM/std": 0.48065707087516785, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9750000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.06831300258636475, "step": 8286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 483.5625, "completions/min_length": 423.0, "epoch": 12.186764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0658077001571655, "kl": 0.011818033875897527, "learning_rate": 3.971741887813303e-07, "loss": 0.0001177623780677095, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 436.5, "completions/min_length": 376.0, "epoch": 12.188235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0047049522399902, "kl": 0.01128493738360703, "learning_rate": 3.9704860222094503e-07, "loss": 0.00011203425674466416, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 485.5625, "completions/min_length": 419.0, "epoch": 12.189705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7107530236244202, "kl": 0.01167750172317028, "learning_rate": 3.969230224427374e-07, "loss": 0.00011722743511199951, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/mean_length": 544.5, "completions/min_length": 481.0, "epoch": 12.191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7258226275444031, "kl": 0.017221145681105554, "learning_rate": 3.967974494549802e-07, "loss": 0.00017340161139145494, "reward": 0.9816250205039978, "reward_std": 0.05197233706712723, "rewards/DrugCombAccuracyCOTORM/mean": 0.9775892496109009, "rewards/DrugCombAccuracyCOTORM/std": 0.08964285254478455, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9955357313156128, "rewards/DrugCombCoverageCOTORM/std": 0.017857149243354797, "step": 8290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 416.5, "completions/min_length": 382.0, "epoch": 12.19264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009231767617166042, "kl": 0.006992431706748903, "learning_rate": 3.966718832659458e-07, "loss": 7.008501415839419e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 426.9375, "completions/min_length": 386.0, "epoch": 12.194117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.03959675878286362, "kl": 0.007842037477530539, "learning_rate": 3.9654632388390624e-07, "loss": 7.818997255526483e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 478.0, "completions/min_length": 385.0, "epoch": 12.195588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7542597651481628, "kl": 0.009121718467213213, "learning_rate": 3.9642077131713304e-07, "loss": 9.07251305761747e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 431.0625, "completions/min_length": 396.0, "epoch": 12.197058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014053760096430779, "kl": 0.009299765923060477, "learning_rate": 3.9629522557389734e-07, "loss": 9.306188439950347e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 438.875, "completions/min_length": 392.0, "epoch": 12.198529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.0774706602096558, "kl": 0.009038952528499067, "learning_rate": 3.9616968666246966e-07, "loss": 9.028199565364048e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 447.375, "completions/min_length": 393.0, "epoch": 12.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.02776419185101986, "kl": 0.010917530744336545, "learning_rate": 3.960441545911204e-07, "loss": 0.00010907710384344682, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 455.875, "completions/min_length": 361.0, "epoch": 12.201470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639003098011017, "kl": 0.008631502860225737, "learning_rate": 3.959186293681191e-07, "loss": 8.564977906644344e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 428.375, "completions/min_length": 381.0, "epoch": 12.202941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.027672264724969864, "kl": 0.008512823143973947, "learning_rate": 3.95793111001735e-07, "loss": 8.560143760405481e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 497.0, "completions/min_length": 401.0, "epoch": 12.204411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.6867902278900146, "kl": 0.00685789983253926, "learning_rate": 3.956675995002371e-07, "loss": 6.867952470201999e-05, "reward": 0.736467182636261, "reward_std": 0.18556492030620575, "rewards/DrugCombAccuracyCOTORM/mean": 0.678396463394165, "rewards/DrugCombAccuracyCOTORM/std": 0.4536697566509247, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 8299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 472.1875, "completions/min_length": 379.0, "epoch": 12.205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 2.0543649196624756, "kl": 0.010222830576822162, "learning_rate": 3.955420948718937e-07, "loss": 0.00010180473327636719, "reward": 0.9479166269302368, "reward_std": 0.1473139226436615, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 424.5625, "completions/min_length": 385.0, "epoch": 12.20735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013247293420135975, "kl": 0.00897444155998528, "learning_rate": 3.954165971249728e-07, "loss": 8.988857734948397e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 427.625, "completions/min_length": 371.0, "epoch": 12.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.016533689573407173, "kl": 0.010112924966961145, "learning_rate": 3.952911062677418e-07, "loss": 0.00010105008550453931, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 461.875, "completions/min_length": 384.0, "epoch": 12.21029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9981150031089783, "kl": 0.012838696828112006, "learning_rate": 3.951656223084677e-07, "loss": 0.00012632764992304146, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 8303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 484.625, "completions/min_length": 392.0, "epoch": 12.211764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9380876421928406, "kl": 0.011327549465931952, "learning_rate": 3.950401452554171e-07, "loss": 0.00011317804455757141, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 465.0, "completions/min_length": 387.0, "epoch": 12.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011912178248167038, "kl": 0.008952947333455086, "learning_rate": 3.9491467511685613e-07, "loss": 8.837305358611047e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 442.8125, "completions/min_length": 383.0, "epoch": 12.214705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.017613301053643227, "kl": 0.009823393076658249, "learning_rate": 3.9478921190105036e-07, "loss": 9.801580745261163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 433.0, "completions/min_length": 371.0, "epoch": 12.216176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.2810161113739014, "kl": 0.011851441115140915, "learning_rate": 3.946637556162651e-07, "loss": 0.000118177333206404, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 450.4375, "completions/min_length": 358.0, "epoch": 12.217647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9955503940582275, "kl": 0.010309185134246945, "learning_rate": 3.9453830627076513e-07, "loss": 0.00010270625352859497, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 469.875, "completions/min_length": 372.0, "epoch": 12.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.963675856590271, "kl": 0.009775644051842391, "learning_rate": 3.9441286387281457e-07, "loss": 9.80312324827537e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 522.75, "completions/min_length": 463.0, "epoch": 12.220588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3819324970245361, "kl": 0.010866221506148577, "learning_rate": 3.942874284306773e-07, "loss": 0.00010943412780761719, "reward": 0.6944208145141602, "reward_std": 0.28249701857566833, "rewards/DrugCombAccuracyCOTORM/mean": 0.6303958892822266, "rewards/DrugCombAccuracyCOTORM/std": 0.396904319524765, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9010416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.1333984136581421, "step": 8310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 499.625, "completions/min_length": 433.0, "epoch": 12.222058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8896557688713074, "kl": 0.008772724191658199, "learning_rate": 3.9416199995261674e-07, "loss": 8.781999349594116e-05, "reward": 0.6770833730697632, "reward_std": 0.16304093599319458, "rewards/DrugCombAccuracyCOTORM/mean": 0.6041666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4901813864707947, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 534.5625, "completions/min_length": 462.0, "epoch": 12.223529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.6814638376235962, "kl": 0.009373311186209321, "learning_rate": 3.940365784468958e-07, "loss": 9.597837924957275e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 454.125, "completions/min_length": 418.0, "epoch": 12.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.010857712477445602, "kl": 0.008878609631210566, "learning_rate": 3.939111639217769e-07, "loss": 8.802703814581037e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 462.25, "completions/min_length": 431.0, "epoch": 12.226470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.1927311420440674, "kl": 0.010952390730381012, "learning_rate": 3.937857563855221e-07, "loss": 0.00010938942432403564, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 528.3125, "completions/min_length": 464.0, "epoch": 12.227941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9675214290618896, "kl": 0.009263657731935382, "learning_rate": 3.936603558463928e-07, "loss": 9.116437286138535e-05, "reward": 0.6124086976051331, "reward_std": 0.056954316794872284, "rewards/DrugCombAccuracyCOTORM/mean": 0.556135892868042, "rewards/DrugCombAccuracyCOTORM/std": 0.4628508388996124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.675000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.5026596188545227, "step": 8315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 500.375, "completions/min_length": 435.0, "epoch": 12.229411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.9483243227005005, "kl": 0.016321323928423226, "learning_rate": 3.9353496231265014e-07, "loss": 0.00016226587467826903, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 436.0, "completions/min_length": 360.0, "epoch": 12.230882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8712523579597473, "kl": 0.008761014207266271, "learning_rate": 3.9340957579255477e-07, "loss": 8.744861406739801e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 452.625, "completions/min_length": 390.0, "epoch": 12.23235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9776187539100647, "kl": 0.007258124649524689, "learning_rate": 3.932841962943668e-07, "loss": 7.278355769813061e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 495.0, "completions/min_length": 474.0, "epoch": 12.233823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8810024857521057, "kl": 0.010849192040041089, "learning_rate": 3.9315882382634584e-07, "loss": 0.00010836124420166016, "reward": 0.6943333148956299, "reward_std": 0.19426637887954712, "rewards/DrugCombAccuracyCOTORM/mean": 0.6387500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.4844498634338379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 8319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 510.0, "completions/min_length": 443.0, "epoch": 12.235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.017329061403870583, "kl": 0.008917543338611722, "learning_rate": 3.930334583967514e-07, "loss": 9.014393435791135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 441.8125, "completions/min_length": 366.0, "epoch": 12.236764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009863640181720257, "kl": 0.006539634778164327, "learning_rate": 3.929081000138418e-07, "loss": 6.544509960804135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 428.9375, "completions/min_length": 369.0, "epoch": 12.238235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02029372751712799, "kl": 0.006569772260263562, "learning_rate": 3.9278274868587563e-07, "loss": 6.523107731482014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 434.9375, "completions/min_length": 364.0, "epoch": 12.239705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0223181247711182, "kl": 0.013494384940713644, "learning_rate": 3.926574044211106e-07, "loss": 0.00013431906700134277, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 395.75, "completions/min_length": 346.0, "epoch": 12.241176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.017008354887366295, "kl": 0.01005315757356584, "learning_rate": 3.925320672278042e-07, "loss": 0.00010053368168883026, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 451.4375, "completions/min_length": 400.0, "epoch": 12.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0713800191879272, "kl": 0.011450625257566571, "learning_rate": 3.924067371142132e-07, "loss": 0.00011369904677849263, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 472.5625, "completions/min_length": 429.0, "epoch": 12.244117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9076606631278992, "kl": 0.0070010918425396085, "learning_rate": 3.9228141408859415e-07, "loss": 6.952881813049316e-05, "reward": 0.6327500343322754, "reward_std": 0.023334523662924767, "rewards/DrugCombAccuracyCOTORM/mean": 0.5721874833106995, "rewards/DrugCombAccuracyCOTORM/std": 0.44363635778427124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 8326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 420.1875, "completions/min_length": 384.0, "epoch": 12.245588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.009825494140386581, "kl": 0.006653242744505405, "learning_rate": 3.921560981592028e-07, "loss": 6.655208562733606e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 435.875, "completions/min_length": 354.0, "epoch": 12.24705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.00753179844468832, "kl": 0.0055772599298506975, "learning_rate": 3.920307893342949e-07, "loss": 5.4973523219814524e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 433.125, "completions/min_length": 365.0, "epoch": 12.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9396305084228516, "kl": 0.011765082366764545, "learning_rate": 3.9190548762212527e-07, "loss": 0.00011625885963439941, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 423.0625, "completions/min_length": 364.0, "epoch": 12.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.012802506797015667, "kl": 0.008508643135428429, "learning_rate": 3.9178019303094856e-07, "loss": 8.504737343173474e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 462.625, "completions/min_length": 405.0, "epoch": 12.251470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.6835149526596069, "kl": 0.008326415088959038, "learning_rate": 3.9165490556901894e-07, "loss": 8.254487329395488e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 490.8125, "completions/min_length": 445.0, "epoch": 12.25294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.445456624031067, "kl": 0.011417536530643702, "learning_rate": 3.9152962524459e-07, "loss": 0.00011505931615829468, "reward": 0.9102500081062317, "reward_std": 0.25385135412216187, "rewards/DrugCombAccuracyCOTORM/mean": 0.8956249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.28591302037239075, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.18130187690258026, "step": 8332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 402.25, "completions/min_length": 337.0, "epoch": 12.254411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.657799243927002, "kl": 0.012195211136713624, "learning_rate": 3.914043520659147e-07, "loss": 0.00012280791997909546, "reward": 0.5053333044052124, "reward_std": 0.18749845027923584, "rewards/DrugCombAccuracyCOTORM/mean": 0.4025000035762787, "rewards/DrugCombAccuracyCOTORM/std": 0.4833701252937317, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 8333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 463.75, "completions/min_length": 407.0, "epoch": 12.255882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8999233245849609, "kl": 0.010538334492594004, "learning_rate": 3.912790860412459e-07, "loss": 0.00010569393634796143, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 454.9375, "completions/min_length": 374.0, "epoch": 12.257352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.9451316595077515, "kl": 0.013656607829034328, "learning_rate": 3.9115382717883583e-07, "loss": 0.00013576420315075666, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 510.375, "completions/min_length": 420.0, "epoch": 12.258823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 1.3178569078445435, "kl": 0.01167560275644064, "learning_rate": 3.9102857548693615e-07, "loss": 0.00011687353253364563, "reward": 0.5437500476837158, "reward_std": 0.34269481897354126, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.4166666865348816, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 469.8125, "completions/min_length": 401.0, "epoch": 12.260294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0264559984207153, "kl": 0.011443232418969274, "learning_rate": 3.9090333097379815e-07, "loss": 0.00011429027654230595, "reward": 0.762499988079071, "reward_std": 0.25599944591522217, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 410.3125, "completions/min_length": 350.0, "epoch": 12.261764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02961190976202488, "kl": 0.008029342046938837, "learning_rate": 3.9077809364767265e-07, "loss": 8.117512334138155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 472.5625, "completions/min_length": 412.0, "epoch": 12.263235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008368859998881817, "kl": 0.006385263055562973, "learning_rate": 3.906528635168099e-07, "loss": 6.381457205861807e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 505.0625, "completions/min_length": 408.0, "epoch": 12.264705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.695003867149353, "kl": 0.017777108354493976, "learning_rate": 3.9052764058945976e-07, "loss": 0.0001772865653038025, "reward": 0.7666666507720947, "reward_std": 0.27126747369766235, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3095695972442627, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.2981424033641815, "step": 8340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 417.25, "completions/min_length": 356.0, "epoch": 12.266176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.008697908371686935, "kl": 0.0068715037778019905, "learning_rate": 3.904024248738716e-07, "loss": 6.858981214463711e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 406.5, "completions/min_length": 345.0, "epoch": 12.26764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7224788069725037, "kl": 0.013141136849299073, "learning_rate": 3.902772163782943e-07, "loss": 0.00013137249334249645, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 413.5, "completions/min_length": 370.0, "epoch": 12.269117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0981523990631104, "kl": 0.009304829873144627, "learning_rate": 3.901520151109764e-07, "loss": 9.31792746996507e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 425.375, "completions/min_length": 376.0, "epoch": 12.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01429835893213749, "kl": 0.009320677258074284, "learning_rate": 3.900268210801658e-07, "loss": 9.225528629031032e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 453.9375, "completions/min_length": 402.0, "epoch": 12.272058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.649996817111969, "kl": 0.0074957957258448005, "learning_rate": 3.899016342941098e-07, "loss": 7.432699203491211e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 477.1875, "completions/min_length": 418.0, "epoch": 12.273529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.5369199514389038, "kl": 0.020523866172879934, "learning_rate": 3.897764547610556e-07, "loss": 0.00020931661128997803, "reward": 0.512499988079071, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 8346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 498.1875, "completions/min_length": 442.0, "epoch": 12.275, "frac_reward_zero_std": 0.5, "grad_norm": 0.9933555722236633, "kl": 0.014359503285959363, "learning_rate": 3.896512824892495e-07, "loss": 0.00014315439329948276, "reward": 0.7964791655540466, "reward_std": 0.16860923171043396, "rewards/DrugCombAccuracyCOTORM/mean": 0.7579687833786011, "rewards/DrugCombAccuracyCOTORM/std": 0.37081801891326904, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9010416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.15280933678150177, "step": 8347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 443.6875, "completions/min_length": 400.0, "epoch": 12.276470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010620868764817715, "kl": 0.008486462291330099, "learning_rate": 3.895261174869377e-07, "loss": 8.421459642704576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 491.5, "completions/min_length": 373.0, "epoch": 12.277941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8681463599205017, "kl": 0.008147358079440892, "learning_rate": 3.894009597623658e-07, "loss": 8.140318095684052e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 426.6875, "completions/min_length": 350.0, "epoch": 12.279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02236895263195038, "kl": 0.011098997900262475, "learning_rate": 3.8927580932377876e-07, "loss": 0.00011061012628488243, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 433.125, "completions/min_length": 370.0, "epoch": 12.280882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.010687325149774551, "kl": 0.007157606538385153, "learning_rate": 3.891506661794211e-07, "loss": 7.171191100496799e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 463.6875, "completions/min_length": 415.0, "epoch": 12.282352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008735871873795986, "kl": 0.007439082022756338, "learning_rate": 3.8902553033753704e-07, "loss": 7.477353210560977e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 460.375, "completions/min_length": 415.0, "epoch": 12.283823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015758885070681572, "kl": 0.008372119162231684, "learning_rate": 3.889004018063702e-07, "loss": 8.374905155505985e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 478.9375, "completions/min_length": 410.0, "epoch": 12.285294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8754051923751831, "kl": 0.006751677603460848, "learning_rate": 3.8877528059416375e-07, "loss": 6.757216033292934e-05, "reward": 0.8812500238418579, "reward_std": 0.2202879637479782, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 8354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 494.625, "completions/min_length": 429.0, "epoch": 12.286764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.696967601776123, "kl": 0.013656720519065857, "learning_rate": 3.8865016670916033e-07, "loss": 0.000136643648147583, "reward": 0.7572708129882812, "reward_std": 0.31695860624313354, "rewards/DrugCombAccuracyCOTORM/mean": 0.7213281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.39168497920036316, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.3056187033653259, "step": 8355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 415.4375, "completions/min_length": 363.0, "epoch": 12.288235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.010873951017856598, "kl": 0.007223158609122038, "learning_rate": 3.8852506015960226e-07, "loss": 7.204653229564428e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 503.9375, "completions/min_length": 445.0, "epoch": 12.28970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011513601057231426, "kl": 0.009587859036400914, "learning_rate": 3.88399960953731e-07, "loss": 9.63644779403694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 443.375, "completions/min_length": 404.0, "epoch": 12.291176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.0619165897369385, "kl": 0.011092016124166548, "learning_rate": 3.882748690997879e-07, "loss": 0.00011163577437400818, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 425.0, "completions/min_length": 408.0, "epoch": 12.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.06135820969939232, "kl": 0.008927105460315943, "learning_rate": 3.8814978460601364e-07, "loss": 8.938271639635786e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 407.5, "completions/min_length": 344.0, "epoch": 12.294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011665922589600086, "kl": 0.007832952891476452, "learning_rate": 3.880247074806485e-07, "loss": 7.766524504404515e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 426.875, "completions/min_length": 380.0, "epoch": 12.295588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.5327087640762329, "kl": 0.025977280805818737, "learning_rate": 3.8789963773193237e-07, "loss": 0.0002506473392713815, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 437.0625, "completions/min_length": 391.0, "epoch": 12.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9488161206245422, "kl": 0.011339899501763284, "learning_rate": 3.8777457536810444e-07, "loss": 0.00011293590068817139, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/mean_length": 531.25, "completions/min_length": 458.0, "epoch": 12.298529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.036893606185913, "kl": 0.009444135124795139, "learning_rate": 3.876495203974034e-07, "loss": 9.417533874511719e-05, "reward": 0.7735416889190674, "reward_std": 0.19803889095783234, "rewards/DrugCombAccuracyCOTORM/mean": 0.7169270515441895, "rewards/DrugCombAccuracyCOTORM/std": 0.4470595121383667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 457.75, "completions/min_length": 420.0, "epoch": 12.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.007165936287492514, "kl": 0.007036763010546565, "learning_rate": 3.8752447282806755e-07, "loss": 7.098813512129709e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 458.625, "completions/min_length": 397.0, "epoch": 12.301470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7230549454689026, "kl": 0.010890085715800524, "learning_rate": 3.873994326683349e-07, "loss": 0.00010944902896881104, "reward": 0.8614374995231628, "reward_std": 0.1148761510848999, "rewards/DrugCombAccuracyCOTORM/mean": 0.8463281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.2356000542640686, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 8365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 473.375, "completions/min_length": 399.0, "epoch": 12.302941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9297018647193909, "kl": 0.01139864488504827, "learning_rate": 3.872743999264426e-07, "loss": 0.00011438876390457153, "reward": 0.6987500190734863, "reward_std": 0.156028151512146, "rewards/DrugCombAccuracyCOTORM/mean": 0.6507812738418579, "rewards/DrugCombAccuracyCOTORM/std": 0.430890291929245, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.78125, "rewards/DrugCombCoverageCOTORM/std": 0.36371922492980957, "step": 8366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 464.4375, "completions/min_length": 377.0, "epoch": 12.304411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.08096912503242493, "kl": 0.009830569615587592, "learning_rate": 3.8714937461062756e-07, "loss": 9.740929090185091e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 461.875, "completions/min_length": 401.0, "epoch": 12.305882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0416990518569946, "kl": 0.008076993050053716, "learning_rate": 3.8702435672912623e-07, "loss": 8.106231689453125e-05, "reward": 0.921625018119812, "reward_std": 0.14512228965759277, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 8368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 501.8125, "completions/min_length": 413.0, "epoch": 12.30735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1445317268371582, "kl": 0.011744859395548701, "learning_rate": 3.8689934629017434e-07, "loss": 0.00011761486530303955, "reward": 0.8770833611488342, "reward_std": 0.1477174162864685, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2713136672973633, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 460.125, "completions/min_length": 371.0, "epoch": 12.308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010356228798627853, "kl": 0.006925550871528685, "learning_rate": 3.8677434330200726e-07, "loss": 6.908953946549445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 502.5625, "completions/min_length": 455.0, "epoch": 12.310294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.02131158672273159, "kl": 0.01156104844994843, "learning_rate": 3.866493477728599e-07, "loss": 0.00011586217442527413, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 463.3125, "completions/min_length": 411.0, "epoch": 12.311764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.3044281005859375, "kl": 0.016199681675061584, "learning_rate": 3.8652435971096667e-07, "loss": 0.0001634061336517334, "reward": 0.8374999761581421, "reward_std": 0.34973084926605225, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 422.1875, "completions/min_length": 359.0, "epoch": 12.313235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.00746631994843483, "kl": 0.006749757216311991, "learning_rate": 3.863993791245614e-07, "loss": 6.733454210916534e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 423.8125, "completions/min_length": 368.0, "epoch": 12.314705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0536726713180542, "kl": 0.012924988172017038, "learning_rate": 3.8627440602187767e-07, "loss": 0.00012474518734961748, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 463.5, "completions/min_length": 405.0, "epoch": 12.316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0691438913345337, "kl": 0.008695980883203447, "learning_rate": 3.8614944041114824e-07, "loss": 8.746336970943958e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 368.375, "completions/min_length": 329.0, "epoch": 12.31764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010838394984602928, "kl": 0.007542721228674054, "learning_rate": 3.8602448230060545e-07, "loss": 7.540344086010009e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 465.125, "completions/min_length": 413.0, "epoch": 12.319117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8802181482315063, "kl": 0.007639090996235609, "learning_rate": 3.858995316984814e-07, "loss": 7.634876237716526e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 481.125, "completions/min_length": 402.0, "epoch": 12.320588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9908443689346313, "kl": 0.011090839048847556, "learning_rate": 3.857745886130074e-07, "loss": 0.00011187046766281128, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 467.875, "completions/min_length": 402.0, "epoch": 12.322058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9752687215805054, "kl": 0.010806137346662581, "learning_rate": 3.856496530524145e-07, "loss": 0.00010694775119191036, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 421.75, "completions/min_length": 388.0, "epoch": 12.323529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.016530340537428856, "kl": 0.008850952610373497, "learning_rate": 3.8552472502493306e-07, "loss": 8.854747284203768e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 422.5625, "completions/min_length": 360.0, "epoch": 12.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.014352696016430855, "kl": 0.009225109359249473, "learning_rate": 3.85399804538793e-07, "loss": 9.248769492842257e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 440.3125, "completions/min_length": 413.0, "epoch": 12.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.05144017934799194, "kl": 0.009535112767480314, "learning_rate": 3.852748916022238e-07, "loss": 9.530578972771764e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 455.6875, "completions/min_length": 375.0, "epoch": 12.327941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.027154000476002693, "kl": 0.00948783801868558, "learning_rate": 3.851499862234544e-07, "loss": 9.467584459343925e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 490.9375, "completions/min_length": 448.0, "epoch": 12.329411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.7480579614639282, "kl": 0.00962395197711885, "learning_rate": 3.850250884107133e-07, "loss": 9.69284592429176e-05, "reward": 0.625, "reward_std": 0.070710688829422, "rewards/DrugCombAccuracyCOTORM/mean": 0.53125, "rewards/DrugCombAccuracyCOTORM/std": 0.4989572763442993, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 408.25, "completions/min_length": 359.0, "epoch": 12.330882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.4161359071731567, "kl": 0.017579776234924793, "learning_rate": 3.849001981722284e-07, "loss": 0.0001763588807079941, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 439.9375, "completions/min_length": 370.0, "epoch": 12.33235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9503874778747559, "kl": 0.012056036619469523, "learning_rate": 3.8477531551622723e-07, "loss": 0.00012115822028135881, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 416.125, "completions/min_length": 368.0, "epoch": 12.333823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.016903366893529892, "kl": 0.009343684301711619, "learning_rate": 3.8465044045093657e-07, "loss": 9.33248084038496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 443.4375, "completions/min_length": 401.0, "epoch": 12.33529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01268160529434681, "kl": 0.008267571218311787, "learning_rate": 3.84525572984583e-07, "loss": 8.259947935584933e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 434.0625, "completions/min_length": 395.0, "epoch": 12.336764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.3223011493682861, "kl": 0.011932400055229664, "learning_rate": 3.8440071312539245e-07, "loss": 0.00012053549289703369, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 478.6875, "completions/min_length": 440.0, "epoch": 12.338235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.0714526176452637, "kl": 0.008627726463600993, "learning_rate": 3.8427586088159037e-07, "loss": 8.61138105392456e-05, "reward": 0.8089166879653931, "reward_std": 0.3232209086418152, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 428.6875, "completions/min_length": 377.0, "epoch": 12.339705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018553633242845535, "kl": 0.00887494022026658, "learning_rate": 3.841510162614017e-07, "loss": 8.73950484674424e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 446.125, "completions/min_length": 397.0, "epoch": 12.341176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.04053401201963425, "kl": 0.011318808421492577, "learning_rate": 3.84026179273051e-07, "loss": 0.00011408279533497989, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 477.75, "completions/min_length": 401.0, "epoch": 12.342647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.3599343299865723, "kl": 0.011597767472267151, "learning_rate": 3.839013499247621e-07, "loss": 0.0001179128885269165, "reward": 0.7589166760444641, "reward_std": 0.33001023530960083, "rewards/DrugCombAccuracyCOTORM/mean": 0.7012500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.46046173572540283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 477.3125, "completions/min_length": 404.0, "epoch": 12.344117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.019583990797400475, "kl": 0.009089392726309597, "learning_rate": 3.8377652822475836e-07, "loss": 9.14120755624026e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 436.875, "completions/min_length": 392.0, "epoch": 12.345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010347919538617134, "kl": 0.0059305825270712376, "learning_rate": 3.836517141812629e-07, "loss": 5.940430492046289e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 470.875, "completions/min_length": 378.0, "epoch": 12.347058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9120568633079529, "kl": 0.009047452826052904, "learning_rate": 3.8352690780249796e-07, "loss": 9.040179429575801e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/mean_length": 534.6875, "completions/min_length": 332.0, "epoch": 12.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9903883934020996, "kl": 0.010508321691304445, "learning_rate": 3.8340210909668567e-07, "loss": 0.0001055300235748291, "reward": 0.5803472399711609, "reward_std": 0.007387829478830099, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8034722208976746, "rewards/DrugCombCoverageCOTORM/std": 0.2266855090856552, "step": 8397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 493.5, "completions/min_length": 436.0, "epoch": 12.35, "frac_reward_zero_std": 0.5, "grad_norm": 1.0037912130355835, "kl": 0.00829248537775129, "learning_rate": 3.8327731807204744e-07, "loss": 8.32340374472551e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 8398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 396.625, "completions/min_length": 297.0, "epoch": 12.351470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3145933151245117, "kl": 0.011911323992535472, "learning_rate": 3.8315253473680396e-07, "loss": 0.00011809170246124268, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 426.5, "completions/min_length": 366.0, "epoch": 12.352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.9377368092536926, "kl": 0.009061546763405204, "learning_rate": 3.8302775909917584e-07, "loss": 9.089354716707021e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 467.625, "completions/min_length": 419.0, "epoch": 12.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.009641180746257305, "kl": 0.008269394165836275, "learning_rate": 3.829029911673829e-07, "loss": 8.215248089982197e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 474.375, "completions/min_length": 425.0, "epoch": 12.355882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.6023445129394531, "kl": 0.00997615628875792, "learning_rate": 3.8277823094964456e-07, "loss": 9.915977716445923e-05, "reward": 0.874833345413208, "reward_std": 0.244136244058609, "rewards/DrugCombAccuracyCOTORM/mean": 0.85916668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.34125587344169617, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 506.125, "completions/min_length": 388.0, "epoch": 12.35735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.284074068069458, "kl": 0.010640420950949192, "learning_rate": 3.826534784541797e-07, "loss": 0.00010633468627929688, "reward": 0.8250000476837158, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.2561737895011902, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 457.75, "completions/min_length": 380.0, "epoch": 12.358823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9876842498779297, "kl": 0.009283186751417816, "learning_rate": 3.825287336892069e-07, "loss": 9.198486804962158e-05, "reward": 0.6919916868209839, "reward_std": 0.12494400888681412, "rewards/DrugCombAccuracyCOTORM/mean": 0.6397291421890259, "rewards/DrugCombAccuracyCOTORM/std": 0.422089546918869, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.23741470277309418, "step": 8404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 406.375, "completions/min_length": 348.0, "epoch": 12.360294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0168903935700655, "kl": 0.010008455254137516, "learning_rate": 3.8240399666294365e-07, "loss": 0.0001004938967525959, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 445.3125, "completions/min_length": 368.0, "epoch": 12.361764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8036919236183167, "kl": 0.007498098071664572, "learning_rate": 3.8227926738360756e-07, "loss": 7.471069693565369e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 482.125, "completions/min_length": 429.0, "epoch": 12.363235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0790092945098877, "kl": 0.010330722550861537, "learning_rate": 3.8215454585941544e-07, "loss": 0.00010375678539276123, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 428.25, "completions/min_length": 398.0, "epoch": 12.364705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.029524369165301323, "kl": 0.00856396195013076, "learning_rate": 3.8202983209858353e-07, "loss": 8.56244150782004e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 414.875, "completions/min_length": 372.0, "epoch": 12.366176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.007782583124935627, "kl": 0.00685516616795212, "learning_rate": 3.8190512610932777e-07, "loss": 6.823999137850478e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 417.1875, "completions/min_length": 363.0, "epoch": 12.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.018151864409446716, "kl": 0.008640856249257922, "learning_rate": 3.8178042789986354e-07, "loss": 8.623028406873345e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 491.3125, "completions/min_length": 434.0, "epoch": 12.369117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013829076662659645, "kl": 0.00913142180070281, "learning_rate": 3.8165573747840546e-07, "loss": 9.101783507503569e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 450.1875, "completions/min_length": 389.0, "epoch": 12.370588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9260502457618713, "kl": 0.008379947277717292, "learning_rate": 3.8153105485316786e-07, "loss": 8.320710912812501e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 520.5, "completions/min_length": 463.0, "epoch": 12.37205882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.121800422668457, "kl": 0.008509967825375497, "learning_rate": 3.8140638003236457e-07, "loss": 8.540600538253784e-05, "reward": 0.512499988079071, "reward_std": 0.318198025226593, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.125, "rewards/DrugCombCoverageCOTORM/std": 1.0246951580047607, "step": 8413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 450.8125, "completions/min_length": 425.0, "epoch": 12.373529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8387326598167419, "kl": 0.008087995811365545, "learning_rate": 3.812817130242089e-07, "loss": 8.068549504969269e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 534.125, "completions/min_length": 461.0, "epoch": 12.375, "frac_reward_zero_std": 0.0, "grad_norm": 1.5286504030227661, "kl": 0.009184791008010507, "learning_rate": 3.811570538369135e-07, "loss": 9.209662675857544e-05, "reward": 0.7102553844451904, "reward_std": 0.2499474287033081, "rewards/DrugCombAccuracyCOTORM/mean": 0.6508400440216064, "rewards/DrugCombAccuracyCOTORM/std": 0.38301846385002136, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.19364917278289795, "step": 8415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 492.5625, "completions/min_length": 414.0, "epoch": 12.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.7294556498527527, "kl": 0.006136677227914333, "learning_rate": 3.810324024786907e-07, "loss": 6.15843600826338e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 8416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 474.0, "completions/min_length": 371.0, "epoch": 12.37794117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.7384811639785767, "kl": 0.014609565725550056, "learning_rate": 3.809077589577521e-07, "loss": 0.00014748051762580872, "reward": 0.7229166626930237, "reward_std": 0.38668400049209595, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 8417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 456.625, "completions/min_length": 389.0, "epoch": 12.379411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8911200761795044, "kl": 0.009423056268133223, "learning_rate": 3.80783123282309e-07, "loss": 9.449270874029025e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 461.3125, "completions/min_length": 415.0, "epoch": 12.380882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.027352197095751762, "kl": 0.011193026322871447, "learning_rate": 3.8065849546057196e-07, "loss": 0.0001127119903685525, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 429.3125, "completions/min_length": 387.0, "epoch": 12.382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.013466140255331993, "kl": 0.007822957122698426, "learning_rate": 3.805338755007512e-07, "loss": 7.871417619753629e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 446.4375, "completions/min_length": 411.0, "epoch": 12.383823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.00953004788607359, "kl": 0.007777873775921762, "learning_rate": 3.804092634110565e-07, "loss": 7.758866559015587e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 534.125, "completions/min_length": 490.0, "epoch": 12.385294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9835291504859924, "kl": 0.008641995373181999, "learning_rate": 3.802846591996969e-07, "loss": 8.566891483496875e-05, "reward": 0.8979166746139526, "reward_std": 0.17597517371177673, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 467.6875, "completions/min_length": 411.0, "epoch": 12.386764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0135330930352211, "kl": 0.011360001983121037, "learning_rate": 3.801600628748809e-07, "loss": 0.00011323871149215847, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 409.875, "completions/min_length": 335.0, "epoch": 12.388235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8760513663291931, "kl": 0.007642718032002449, "learning_rate": 3.800354744448167e-07, "loss": 7.608703890582547e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 519.8125, "completions/min_length": 469.0, "epoch": 12.389705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.5631921291351318, "kl": 0.016921117203310132, "learning_rate": 3.799108939177118e-07, "loss": 0.0001671314239501953, "reward": 0.7041666507720947, "reward_std": 0.31625911593437195, "rewards/DrugCombAccuracyCOTORM/mean": 0.6666666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.42163705825805664, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.4281744360923767, "step": 8425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 425.625, "completions/min_length": 380.0, "epoch": 12.391176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.02022579126060009, "kl": 0.009093534084968269, "learning_rate": 3.7978632130177336e-07, "loss": 9.054300608113408e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 504.9375, "completions/min_length": 460.0, "epoch": 12.39264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0008466243743896, "kl": 0.010271511506289244, "learning_rate": 3.796617566052078e-07, "loss": 0.0001034662127494812, "reward": 0.8633333444595337, "reward_std": 0.11313710361719131, "rewards/DrugCombAccuracyCOTORM/mean": 0.8500000238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.24765567481517792, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 445.0625, "completions/min_length": 375.0, "epoch": 12.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.031162748113274574, "kl": 0.009406392695382237, "learning_rate": 3.7953719983622134e-07, "loss": 9.420886635780334e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/mean_length": 504.25, "completions/min_length": 407.0, "epoch": 12.395588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8484415411949158, "kl": 0.009874083334580064, "learning_rate": 3.7941265100301916e-07, "loss": 9.85860824584961e-05, "reward": 0.9513333439826965, "reward_std": 0.13765011727809906, "rewards/DrugCombAccuracyCOTORM/mean": 0.9443750381469727, "rewards/DrugCombAccuracyCOTORM/std": 0.2224999964237213, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 8429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 449.9375, "completions/min_length": 415.0, "epoch": 12.397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.0113636814057827, "kl": 0.00792694219853729, "learning_rate": 3.792881101138064e-07, "loss": 7.897099567344412e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 473.625, "completions/min_length": 422.0, "epoch": 12.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9129075407981873, "kl": 0.009146512253209949, "learning_rate": 3.7916357717678744e-07, "loss": 9.141609189100564e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 399.25, "completions/min_length": 362.0, "epoch": 12.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.3640882968902588, "kl": 0.011399032780900598, "learning_rate": 3.790390522001662e-07, "loss": 0.0001138448715209961, "reward": 0.5820833444595337, "reward_std": 0.2046433538198471, "rewards/DrugCombAccuracyCOTORM/mean": 0.5062500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.4576297700405121, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.26440009474754333, "step": 8432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 493.0, "completions/min_length": 414.0, "epoch": 12.401470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4203333854675293, "kl": 0.010977861005812883, "learning_rate": 3.789145351921461e-07, "loss": 0.00010767579078674316, "reward": 0.8441874980926514, "reward_std": 0.33252325654029846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8228124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.38252657651901245, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.859375, "rewards/DrugCombCoverageCOTORM/std": 0.49973952770233154, "step": 8433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 464.9375, "completions/min_length": 378.0, "epoch": 12.402941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.136048436164856, "kl": 0.009049312444403768, "learning_rate": 3.7879002616093007e-07, "loss": 8.99160368135199e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 435.1875, "completions/min_length": 399.0, "epoch": 12.404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.020837705582380295, "kl": 0.00989052397198975, "learning_rate": 3.7866552511472035e-07, "loss": 9.868282359093428e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 452.125, "completions/min_length": 416.0, "epoch": 12.405882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.009939881041646004, "kl": 0.007108025485649705, "learning_rate": 3.7854103206171867e-07, "loss": 7.099497452145442e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 455.75, "completions/min_length": 387.0, "epoch": 12.407352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01761562190949917, "kl": 0.010960059706121683, "learning_rate": 3.784165470101264e-07, "loss": 0.00011000219819834456, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 446.6875, "completions/min_length": 363.0, "epoch": 12.408823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0791221857070923, "kl": 0.011592476163059473, "learning_rate": 3.782920699681444e-07, "loss": 0.00011639681179076433, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 475.8125, "completions/min_length": 436.0, "epoch": 12.410294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.007992039434611797, "kl": 0.00783063133712858, "learning_rate": 3.7816760094397283e-07, "loss": 7.84805160947144e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 485.625, "completions/min_length": 396.0, "epoch": 12.411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1006860733032227, "kl": 0.009823622414842248, "learning_rate": 3.780431399458114e-07, "loss": 9.723984112497419e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 471.4375, "completions/min_length": 414.0, "epoch": 12.413235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.4231737852096558, "kl": 0.01248026778921485, "learning_rate": 3.779186869818592e-07, "loss": 0.00012543797492980957, "reward": 0.8479166626930237, "reward_std": 0.24087008833885193, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.28198206424713135, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 410.5625, "completions/min_length": 367.0, "epoch": 12.41470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9947791695594788, "kl": 0.015406001824885607, "learning_rate": 3.77794242060315e-07, "loss": 0.00015500292647629976, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 404.5, "completions/min_length": 344.0, "epoch": 12.416176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016611676663160324, "kl": 0.008758551441133022, "learning_rate": 3.7766980518937703e-07, "loss": 8.679234451847151e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 435.75, "completions/min_length": 387.0, "epoch": 12.41764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8083682060241699, "kl": 0.008228167076595128, "learning_rate": 3.7754537637724253e-07, "loss": 8.14275408629328e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 433.375, "completions/min_length": 344.0, "epoch": 12.419117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.6216944456100464, "kl": 0.013620964251458645, "learning_rate": 3.774209556321088e-07, "loss": 0.00013500452041625977, "reward": 0.4026666581630707, "reward_std": 0.43770861625671387, "rewards/DrugCombAccuracyCOTORM/mean": 0.32625001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.47225522994995117, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8819171190261841, "step": 8445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 426.5, "completions/min_length": 393.0, "epoch": 12.420588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013044298626482487, "kl": 0.008139801910147071, "learning_rate": 3.772965429621723e-07, "loss": 8.065153087954968e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 460.4375, "completions/min_length": 408.0, "epoch": 12.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.272835612297058, "kl": 0.010263916221447289, "learning_rate": 3.7717213837562894e-07, "loss": 0.00010231487249257043, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 517.0, "completions/min_length": 400.0, "epoch": 12.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9475386142730713, "kl": 0.008892528829164803, "learning_rate": 3.7704774188067436e-07, "loss": 8.849793812260032e-05, "reward": 0.9629583358764648, "reward_std": 0.06874111294746399, "rewards/DrugCombAccuracyCOTORM/mean": 0.955651044845581, "rewards/DrugCombAccuracyCOTORM/std": 0.12143747508525848, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.042695630341768265, "step": 8448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 461.8125, "completions/min_length": 398.0, "epoch": 12.425, "frac_reward_zero_std": 0.5, "grad_norm": 1.1102007627487183, "kl": 0.01655996683984995, "learning_rate": 3.7692335348550346e-07, "loss": 0.00016368247452192008, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 460.3125, "completions/min_length": 411.0, "epoch": 12.426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 1.2620410919189453, "kl": 0.011149058816954494, "learning_rate": 3.7679897319831044e-07, "loss": 0.00011094153160229325, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 442.9375, "completions/min_length": 370.0, "epoch": 12.427941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.020899469032883644, "kl": 0.010416928213089705, "learning_rate": 3.766746010272893e-07, "loss": 0.00010385299538029358, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 435.5625, "completions/min_length": 401.0, "epoch": 12.429411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.021591629832983017, "kl": 0.010164560982957482, "learning_rate": 3.7655023698063337e-07, "loss": 0.00010183239646721631, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 436.9375, "completions/min_length": 366.0, "epoch": 12.430882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.1855862140655518, "kl": 0.0156968334922567, "learning_rate": 3.7642588106653537e-07, "loss": 0.00016480653721373528, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 449.75, "completions/min_length": 398.0, "epoch": 12.43235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8596121668815613, "kl": 0.010407135472632945, "learning_rate": 3.763015332931877e-07, "loss": 0.0001036420144373551, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 443.875, "completions/min_length": 393.0, "epoch": 12.433823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.008317550644278526, "kl": 0.006897041690535843, "learning_rate": 3.7617719366878196e-07, "loss": 6.905969348736107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 481.3125, "completions/min_length": 386.0, "epoch": 12.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03265252336859703, "kl": 0.013031409587711096, "learning_rate": 3.7605286220150934e-07, "loss": 0.00013008122914470732, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 430.9375, "completions/min_length": 358.0, "epoch": 12.436764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011157692410051823, "kl": 0.0071046469965949655, "learning_rate": 3.759285388995604e-07, "loss": 7.098969217622653e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 410.125, "completions/min_length": 345.0, "epoch": 12.438235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.029144831001758575, "kl": 0.006970649701543152, "learning_rate": 3.758042237711254e-07, "loss": 7.08908773958683e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 445.0625, "completions/min_length": 383.0, "epoch": 12.439705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7348303198814392, "kl": 0.009646320599131286, "learning_rate": 3.7567991682439377e-07, "loss": 9.687513374956325e-05, "reward": 0.17499999701976776, "reward_std": 0.15811389684677124, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 475.25, "completions/min_length": 426.0, "epoch": 12.441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0838861465454102, "kl": 0.010073118610307574, "learning_rate": 3.7555561806755463e-07, "loss": 0.00010173022747039795, "reward": 0.45891666412353516, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 1.0144785642623901, "step": 8460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 431.625, "completions/min_length": 385.0, "epoch": 12.44264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01040361262857914, "kl": 0.00674956978764385, "learning_rate": 3.7543132750879656e-07, "loss": 6.755991489626467e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 419.5625, "completions/min_length": 388.0, "epoch": 12.444117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013842134736478329, "kl": 0.008187599829398096, "learning_rate": 3.753070451563073e-07, "loss": 8.206514030462131e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 485.3125, "completions/min_length": 408.0, "epoch": 12.445588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9363948106765747, "kl": 0.01467754296027124, "learning_rate": 3.7518277101827436e-07, "loss": 0.00014684205234516412, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 435.25, "completions/min_length": 370.0, "epoch": 12.447058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0722498893737793, "kl": 0.01134144701063633, "learning_rate": 3.750585051028845e-07, "loss": 0.00011264999920967966, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 422.625, "completions/min_length": 360.0, "epoch": 12.448529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.115084171295166, "kl": 0.01455408032052219, "learning_rate": 3.7493424741832425e-07, "loss": 0.00014488399028778076, "reward": 0.737500011920929, "reward_std": 0.2199837565422058, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 435.75, "completions/min_length": 391.0, "epoch": 12.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.014104623347520828, "kl": 0.008475169423036277, "learning_rate": 3.7480999797277917e-07, "loss": 8.484671707265079e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 443.6875, "completions/min_length": 379.0, "epoch": 12.451470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.029126152396202087, "kl": 0.009988203993998468, "learning_rate": 3.746857567744348e-07, "loss": 9.8637436167337e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 514.8125, "completions/min_length": 409.0, "epoch": 12.452941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014785725623369217, "kl": 0.007603420177474618, "learning_rate": 3.745615238314755e-07, "loss": 7.646388257853687e-05, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.12909944355487823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 462.625, "completions/min_length": 359.0, "epoch": 12.454411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.012319201603531837, "kl": 0.009790246491320431, "learning_rate": 3.7443729915208557e-07, "loss": 9.86416926025413e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 484.5, "completions/min_length": 397.0, "epoch": 12.455882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9057915806770325, "kl": 0.009386180434376001, "learning_rate": 3.743130827444486e-07, "loss": 9.433180093765259e-05, "reward": 0.8260416984558105, "reward_std": 0.021564556285738945, "rewards/DrugCombAccuracyCOTORM/mean": 0.8020833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.21273136138916016, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 8470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 471.9375, "completions/min_length": 408.0, "epoch": 12.45735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6657264232635498, "kl": 0.010326511226594448, "learning_rate": 3.7418887461674765e-07, "loss": 0.0001032799482345581, "reward": 0.8500000238418579, "reward_std": 0.3265853524208069, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 454.5625, "completions/min_length": 380.0, "epoch": 12.458823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1429400444030762, "kl": 0.008595354040153325, "learning_rate": 3.740646747771653e-07, "loss": 8.534372318536043e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 439.9375, "completions/min_length": 369.0, "epoch": 12.46029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012402934953570366, "kl": 0.008616854320280254, "learning_rate": 3.7394048323388356e-07, "loss": 8.668737427797168e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 434.0, "completions/min_length": 355.0, "epoch": 12.461764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.0032567977905273, "kl": 0.007978471228852868, "learning_rate": 3.738162999950837e-07, "loss": 7.905438542366028e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 461.0, "completions/min_length": 406.0, "epoch": 12.463235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2050434350967407, "kl": 0.00953456514980644, "learning_rate": 3.736921250689466e-07, "loss": 9.478998981649056e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 447.875, "completions/min_length": 391.0, "epoch": 12.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.7269366383552551, "kl": 0.009134130785241723, "learning_rate": 3.7356795846365263e-07, "loss": 9.091198444366455e-05, "reward": 0.656166672706604, "reward_std": 0.04289780929684639, "rewards/DrugCombAccuracyCOTORM/mean": 0.5962499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.4203629493713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2687419056892395, "step": 8476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 449.5625, "completions/min_length": 410.0, "epoch": 12.466176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011732857674360275, "kl": 0.008230243460275233, "learning_rate": 3.7344380018738164e-07, "loss": 8.193504618247971e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 413.625, "completions/min_length": 367.0, "epoch": 12.467647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01270572654902935, "kl": 0.008619499974884093, "learning_rate": 3.7331965024831285e-07, "loss": 8.633120887679979e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 476.8125, "completions/min_length": 428.0, "epoch": 12.469117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01189874392002821, "kl": 0.009522815234959126, "learning_rate": 3.73195508654625e-07, "loss": 9.597370080882683e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 529.3125, "completions/min_length": 428.0, "epoch": 12.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8216876983642578, "kl": 0.0096269641071558, "learning_rate": 3.7307137541449605e-07, "loss": 9.69551329035312e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 415.6875, "completions/min_length": 380.0, "epoch": 12.472058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.03285793960094452, "kl": 0.008359680767171085, "learning_rate": 3.7294725053610366e-07, "loss": 8.419812365900725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 454.5, "completions/min_length": 376.0, "epoch": 12.473529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0276786088943481, "kl": 0.011342508252710104, "learning_rate": 3.7282313402762483e-07, "loss": 0.00011225789785385132, "reward": 0.6551250219345093, "reward_std": 0.21953549981117249, "rewards/DrugCombAccuracyCOTORM/mean": 0.6353124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.48780280351638794, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46875, "rewards/DrugCombCoverageCOTORM/std": 0.8844725489616394, "step": 8482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 435.8125, "completions/min_length": 358.0, "epoch": 12.475, "frac_reward_zero_std": 1.0, "grad_norm": 0.011217737570405006, "kl": 0.008286229218356311, "learning_rate": 3.726990258972361e-07, "loss": 8.16471510916017e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 502.0625, "completions/min_length": 409.0, "epoch": 12.476470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0005171298980713, "kl": 0.009270518785342574, "learning_rate": 3.725749261531134e-07, "loss": 9.288000728702173e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 478.5, "completions/min_length": 421.0, "epoch": 12.477941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.015654386952519417, "kl": 0.009043837431818247, "learning_rate": 3.724508348034322e-07, "loss": 9.007314656628296e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 425.3125, "completions/min_length": 355.0, "epoch": 12.479411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018105190247297287, "kl": 0.009009306784719229, "learning_rate": 3.7232675185636716e-07, "loss": 9.059202420758083e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 453.875, "completions/min_length": 379.0, "epoch": 12.480882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.009385965764522552, "kl": 0.0074670977191999555, "learning_rate": 3.722026773200925e-07, "loss": 7.423140050377697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 454.1875, "completions/min_length": 407.0, "epoch": 12.48235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013498625718057156, "kl": 0.008552705054171383, "learning_rate": 3.7207861120278215e-07, "loss": 8.663129119668156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 462.875, "completions/min_length": 381.0, "epoch": 12.483823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.014585843309760094, "kl": 0.009832095936872065, "learning_rate": 3.719545535126091e-07, "loss": 9.692672756500542e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 450.1875, "completions/min_length": 375.0, "epoch": 12.485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.011413595639169216, "kl": 0.008553074090741575, "learning_rate": 3.7183050425774603e-07, "loss": 8.584180613979697e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 489.5625, "completions/min_length": 403.0, "epoch": 12.486764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9866697192192078, "kl": 0.00967286853119731, "learning_rate": 3.7170646344636515e-07, "loss": 9.652784501668066e-05, "reward": 0.6434999704360962, "reward_std": 0.15202443301677704, "rewards/DrugCombAccuracyCOTORM/mean": 0.5934374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.47937363386154175, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 473.75, "completions/min_length": 439.0, "epoch": 12.488235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.038476824760437, "kl": 0.01046356197912246, "learning_rate": 3.715824310866376e-07, "loss": 0.00010477586329216138, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 407.3125, "completions/min_length": 329.0, "epoch": 12.489705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.7209771871566772, "kl": 0.01236547389999032, "learning_rate": 3.714584071867345e-07, "loss": 0.00012359395623207092, "reward": 0.518583357334137, "reward_std": 0.39685916900634766, "rewards/DrugCombAccuracyCOTORM/mean": 0.42166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.49750491976737976, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 422.4375, "completions/min_length": 361.0, "epoch": 12.491176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.01333469059318304, "kl": 0.008237684960477054, "learning_rate": 3.713343917548263e-07, "loss": 8.220259769586846e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 417.25, "completions/min_length": 370.0, "epoch": 12.492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.022933796048164368, "kl": 0.011108255828730762, "learning_rate": 3.7121038479908275e-07, "loss": 0.00011090606130892411, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 435.125, "completions/min_length": 355.0, "epoch": 12.494117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01794993318617344, "kl": 0.010836946545168757, "learning_rate": 3.710863863276731e-07, "loss": 0.00010940741776721552, "reward": 0.625333309173584, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5733333230018616, "rewards/DrugCombAccuracyCOTORM/std": 0.44065946340560913, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6666666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 8496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 392.875, "completions/min_length": 336.0, "epoch": 12.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008638552390038967, "kl": 0.006610788055695593, "learning_rate": 3.709623963487662e-07, "loss": 6.581265188287944e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 409.625, "completions/min_length": 341.0, "epoch": 12.49705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008901705965399742, "kl": 0.006408510846085846, "learning_rate": 3.7083841487052993e-07, "loss": 6.444532482419163e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 472.125, "completions/min_length": 432.0, "epoch": 12.498529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8788047432899475, "kl": 0.011470049270428717, "learning_rate": 3.7071444190113207e-07, "loss": 0.00011539069237187505, "reward": 0.9026666879653931, "reward_std": 0.18312038481235504, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 8499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 435.5625, "completions/min_length": 389.0, "epoch": 12.5, "frac_reward_zero_std": 0.5, "grad_norm": 1.1064784526824951, "kl": 0.011990908533334732, "learning_rate": 3.7059047744873955e-07, "loss": 0.00011878243094542995, "reward": 0.7437499761581421, "reward_std": 0.21286733448505402, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 422.4375, "completions/min_length": 381.0, "epoch": 12.501470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012233572080731392, "kl": 0.01043720543384552, "learning_rate": 3.704665215215189e-07, "loss": 0.00010362875764258206, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 505.3125, "completions/min_length": 449.0, "epoch": 12.50294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01380434725433588, "kl": 0.008128103916533291, "learning_rate": 3.7034257412763605e-07, "loss": 8.1697988207452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 466.25, "completions/min_length": 393.0, "epoch": 12.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.024872224777936935, "kl": 0.010865869815461338, "learning_rate": 3.7021863527525634e-07, "loss": 0.00010707411274779588, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 429.0625, "completions/min_length": 360.0, "epoch": 12.505882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8309320211410522, "kl": 0.010464497492648661, "learning_rate": 3.700947049725444e-07, "loss": 0.00010468065738677979, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 439.0625, "completions/min_length": 387.0, "epoch": 12.507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.017954137176275253, "kl": 0.009665684308856726, "learning_rate": 3.699707832276646e-07, "loss": 9.679578943178058e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 428.375, "completions/min_length": 309.0, "epoch": 12.508823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.022516217082738876, "kl": 0.008879870758391917, "learning_rate": 3.6984687004878044e-07, "loss": 8.908704330679029e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 456.6875, "completions/min_length": 385.0, "epoch": 12.510294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9388315677642822, "kl": 0.01767319138161838, "learning_rate": 3.6972296544405524e-07, "loss": 0.00017649000801611692, "reward": 0.7749999761581421, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 462.1875, "completions/min_length": 380.0, "epoch": 12.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010895686224102974, "kl": 0.008650850388221443, "learning_rate": 3.695990694216513e-07, "loss": 8.659667946631089e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 483.375, "completions/min_length": 408.0, "epoch": 12.513235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1771782636642456, "kl": 0.010557946981862187, "learning_rate": 3.694751819897308e-07, "loss": 0.00010526068217586726, "reward": 0.8013020753860474, "reward_std": 0.016552355140447617, "rewards/DrugCombAccuracyCOTORM/mean": 0.7604166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25069350004196167, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9296875, "rewards/DrugCombCoverageCOTORM/std": 0.12884704768657684, "step": 8509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 512.4375, "completions/min_length": 382.0, "epoch": 12.514705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1986743211746216, "kl": 0.01000380318146199, "learning_rate": 3.6935130315645484e-07, "loss": 9.942799806594849e-05, "reward": 0.9520416855812073, "reward_std": 0.06830496340990067, "rewards/DrugCombAccuracyCOTORM/mean": 0.9439583420753479, "rewards/DrugCombAccuracyCOTORM/std": 0.12388297915458679, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 8510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 423.25, "completions/min_length": 363.0, "epoch": 12.516176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010660490021109581, "kl": 0.0074378360295668244, "learning_rate": 3.6922743292998445e-07, "loss": 7.428480603266507e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 412.75, "completions/min_length": 352.0, "epoch": 12.51764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016575084999203682, "kl": 0.00850804045330733, "learning_rate": 3.691035713184798e-07, "loss": 8.569806232117116e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 427.875, "completions/min_length": 392.0, "epoch": 12.519117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.00890609435737133, "kl": 0.0065780250588431954, "learning_rate": 3.6897971833010065e-07, "loss": 6.592535646632314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 465.25, "completions/min_length": 362.0, "epoch": 12.520588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8612769246101379, "kl": 0.010128593887202442, "learning_rate": 3.6885587397300606e-07, "loss": 0.0001010300766211003, "reward": 0.6270833015441895, "reward_std": 0.0766032412648201, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 434.3125, "completions/min_length": 399.0, "epoch": 12.522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0105655193328857, "kl": 0.013098653173074126, "learning_rate": 3.687320382553547e-07, "loss": 0.00013113021850585938, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 454.375, "completions/min_length": 390.0, "epoch": 12.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.1734091341495514, "kl": 0.020066512748599052, "learning_rate": 3.686082111853044e-07, "loss": 0.00020220605074428022, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 435.8125, "completions/min_length": 379.0, "epoch": 12.525, "frac_reward_zero_std": 1.0, "grad_norm": 0.029552219435572624, "kl": 0.011591706308536232, "learning_rate": 3.6848439277101256e-07, "loss": 0.00011428882862674072, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 422.0, "completions/min_length": 339.0, "epoch": 12.526470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8464282751083374, "kl": 0.011484029237180948, "learning_rate": 3.6836058302063615e-07, "loss": 0.00011543929576873779, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 515.6875, "completions/min_length": 462.0, "epoch": 12.527941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7810004353523254, "kl": 0.0084772469708696, "learning_rate": 3.6823678194233145e-07, "loss": 8.419901132583618e-05, "reward": 0.5484374761581421, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 413.75, "completions/min_length": 341.0, "epoch": 12.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.020688941702246666, "kl": 0.008680448634549975, "learning_rate": 3.6811298954425395e-07, "loss": 8.58902494655922e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 455.125, "completions/min_length": 389.0, "epoch": 12.530882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8622391819953918, "kl": 0.013549414929002523, "learning_rate": 3.6798920583455915e-07, "loss": 0.00013301894068717957, "reward": 0.7040250301361084, "reward_std": 0.19708573818206787, "rewards/DrugCombAccuracyCOTORM/mean": 0.6683124899864197, "rewards/DrugCombAccuracyCOTORM/std": 0.4483637809753418, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6937500238418579, "rewards/DrugCombCoverageCOTORM/std": 0.6725263595581055, "step": 8521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 445.875, "completions/min_length": 411.0, "epoch": 12.532352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0528318881988525, "kl": 0.0140669378452003, "learning_rate": 3.678654308214013e-07, "loss": 0.00014194101095199585, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 442.5, "completions/min_length": 407.0, "epoch": 12.533823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.05544114112854, "kl": 0.0098196571925655, "learning_rate": 3.6774166451293444e-07, "loss": 9.850058995652944e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 432.3125, "completions/min_length": 377.0, "epoch": 12.535294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1229217052459717, "kl": 0.012883809860795736, "learning_rate": 3.6761790691731204e-07, "loss": 0.0001276816037716344, "reward": 0.8374999761581421, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 421.3125, "completions/min_length": 364.0, "epoch": 12.536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158752992749214, "kl": 0.010964864282868803, "learning_rate": 3.674941580426869e-07, "loss": 0.00010914894664892927, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 431.25, "completions/min_length": 354.0, "epoch": 12.538235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.009389403276145458, "kl": 0.0073084363248199224, "learning_rate": 3.673704178972113e-07, "loss": 7.353344699367881e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 438.6875, "completions/min_length": 379.0, "epoch": 12.53970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01062565203756094, "kl": 0.007504485547542572, "learning_rate": 3.67246686489037e-07, "loss": 7.562625251011923e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 468.9375, "completions/min_length": 433.0, "epoch": 12.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9049856066703796, "kl": 0.011175690218806267, "learning_rate": 3.67122963826315e-07, "loss": 0.00011111795902252197, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 588.375, "completions/min_length": 532.0, "epoch": 12.54264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.135611653327942, "kl": 0.009450335055589676, "learning_rate": 3.6699924991719583e-07, "loss": 9.551644325256348e-05, "reward": 0.8932833671569824, "reward_std": 0.21116474270820618, "rewards/DrugCombAccuracyCOTORM/mean": 0.8707708120346069, "rewards/DrugCombAccuracyCOTORM/std": 0.2727417051792145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9666666984558105, "rewards/DrugCombCoverageCOTORM/std": 0.07200822979211807, "step": 8529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 426.0, "completions/min_length": 396.0, "epoch": 12.544117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014854486100375652, "kl": 0.007386529119685292, "learning_rate": 3.668755447698295e-07, "loss": 7.429506513290107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 420.9375, "completions/min_length": 394.0, "epoch": 12.545588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010855969972908497, "kl": 0.005867588333785534, "learning_rate": 3.667518483923654e-07, "loss": 5.868220250704326e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 442.1875, "completions/min_length": 402.0, "epoch": 12.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01584777981042862, "kl": 0.008333766483701766, "learning_rate": 3.666281607929523e-07, "loss": 8.352426812052727e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 424.3125, "completions/min_length": 385.0, "epoch": 12.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.008922533132135868, "kl": 0.007214218960143626, "learning_rate": 3.665044819797385e-07, "loss": 7.206027657957748e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 502.875, "completions/min_length": 434.0, "epoch": 12.55, "frac_reward_zero_std": 0.0, "grad_norm": 1.3739585876464844, "kl": 0.010408237809315324, "learning_rate": 3.6638081196087153e-07, "loss": 0.00010347366333007812, "reward": 0.4624999761581421, "reward_std": 0.44116348028182983, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 442.3125, "completions/min_length": 381.0, "epoch": 12.551470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.03196616470813751, "kl": 0.009705241420306265, "learning_rate": 3.662571507444986e-07, "loss": 9.739508095663041e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 479.5, "completions/min_length": 406.0, "epoch": 12.552941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9676263928413391, "kl": 0.011156405555084348, "learning_rate": 3.66133498338766e-07, "loss": 0.00011147191980853677, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 416.4375, "completions/min_length": 373.0, "epoch": 12.554411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.017064273357391357, "kl": 0.010284498566761613, "learning_rate": 3.660098547518198e-07, "loss": 0.00010351771197747439, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 466.1875, "completions/min_length": 412.0, "epoch": 12.555882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.15463125705719, "kl": 0.01189522142522037, "learning_rate": 3.658862199918054e-07, "loss": 0.00011831862502731383, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 443.625, "completions/min_length": 364.0, "epoch": 12.55735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 2179.912109375, "kl": 7.47409054543823, "learning_rate": 3.6576259406686727e-07, "loss": 0.08248215168714523, "reward": 0.8312499523162842, "reward_std": 0.3617092967033386, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 8539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 442.3125, "completions/min_length": 398.0, "epoch": 12.558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0982285737991333, "kl": 0.010205844882875681, "learning_rate": 3.656389769851498e-07, "loss": 0.00010250808554701507, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 446.1875, "completions/min_length": 396.0, "epoch": 12.560294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.870140016078949, "kl": 0.009953885106369853, "learning_rate": 3.655153687547965e-07, "loss": 9.898897405946627e-05, "reward": 0.875, "reward_std": 0.18322508037090302, "rewards/DrugCombAccuracyCOTORM/mean": 0.84375, "rewards/DrugCombAccuracyCOTORM/std": 0.3520771861076355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 477.4375, "completions/min_length": 412.0, "epoch": 12.561764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0328534841537476, "kl": 0.012729272479191422, "learning_rate": 3.653917693839503e-07, "loss": 0.00012997639714740217, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 437.8125, "completions/min_length": 379.0, "epoch": 12.563235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8617289662361145, "kl": 0.014518820447847247, "learning_rate": 3.6526817888075375e-07, "loss": 0.000145837664604187, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 484.875, "completions/min_length": 409.0, "epoch": 12.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.099609375, "kl": 0.009265313390642405, "learning_rate": 3.6514459725334867e-07, "loss": 9.293168841395527e-05, "reward": 0.8999999761581421, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.22360680997371674, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 467.5625, "completions/min_length": 418.0, "epoch": 12.566176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8617558479309082, "kl": 0.010208509396761656, "learning_rate": 3.6502102450987617e-07, "loss": 0.00010203720012214035, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 481.875, "completions/min_length": 394.0, "epoch": 12.56764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015242126770317554, "kl": 0.009595826966688037, "learning_rate": 3.6489746065847694e-07, "loss": 9.570772817824036e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 494.75, "completions/min_length": 388.0, "epoch": 12.569117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.2552282810211182, "kl": 0.009533363161608577, "learning_rate": 3.647739057072912e-07, "loss": 9.590387344360352e-05, "reward": 0.671875, "reward_std": 0.24951976537704468, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.41708314418792725, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 453.875, "completions/min_length": 366.0, "epoch": 12.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.155149221420288, "kl": 0.013560438761487603, "learning_rate": 3.6465035966445823e-07, "loss": 0.00013528861745726317, "reward": 0.5874999761581421, "reward_std": 0.0353553369641304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 464.5625, "completions/min_length": 399.0, "epoch": 12.572058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3007932901382446, "kl": 0.010716775199398398, "learning_rate": 3.645268225381171e-07, "loss": 0.00010632723569869995, "reward": 0.75, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 443.3125, "completions/min_length": 392.0, "epoch": 12.573529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.9499661922454834, "kl": 0.010597578948363662, "learning_rate": 3.6440329433640617e-07, "loss": 0.00010701939754653722, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 437.125, "completions/min_length": 404.0, "epoch": 12.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.9159919023513794, "kl": 0.012184340739622712, "learning_rate": 3.642797750674629e-07, "loss": 0.00012355043145362288, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 470.4375, "completions/min_length": 403.0, "epoch": 12.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8635705709457397, "kl": 0.010548043763265014, "learning_rate": 3.641562647394246e-07, "loss": 0.00010445713996887207, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 506.6875, "completions/min_length": 469.0, "epoch": 12.577941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3346236944198608, "kl": 0.011053812457248569, "learning_rate": 3.640327633604278e-07, "loss": 0.00011077895760536194, "reward": 0.5770833492279053, "reward_std": 0.37877243757247925, "rewards/DrugCombAccuracyCOTORM/mean": 0.4791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4669642150402069, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 8553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 440.1875, "completions/min_length": 386.0, "epoch": 12.579411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0616687536239624, "kl": 0.012105724308639765, "learning_rate": 3.6390927093860846e-07, "loss": 0.00012052888632752001, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 502.9375, "completions/min_length": 375.0, "epoch": 12.580882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9534409642219543, "kl": 0.01152965473011136, "learning_rate": 3.63785787482102e-07, "loss": 0.00011565536260604858, "reward": 0.8401747941970825, "reward_std": 0.06477773189544678, "rewards/DrugCombAccuracyCOTORM/mean": 0.8158434629440308, "rewards/DrugCombAccuracyCOTORM/std": 0.22163459658622742, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 470.8125, "completions/min_length": 416.0, "epoch": 12.58235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009928297251462936, "kl": 0.007860647863708436, "learning_rate": 3.636623129990432e-07, "loss": 7.835702854208648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 472.875, "completions/min_length": 402.0, "epoch": 12.583823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9424166679382324, "kl": 0.008720037643797696, "learning_rate": 3.6353884749756614e-07, "loss": 8.653975964989513e-05, "reward": 0.637499988079071, "reward_std": 0.22638462483882904, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 8557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 487.8125, "completions/min_length": 412.0, "epoch": 12.58529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2963286638259888, "kl": 0.010457611409947276, "learning_rate": 3.6341539098580443e-07, "loss": 0.00010497868061065674, "reward": 0.6600833535194397, "reward_std": 0.35227903723716736, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4699627757072449, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.49767982959747314, "step": 8558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 459.9375, "completions/min_length": 370.0, "epoch": 12.586764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.01356601808220148, "kl": 0.008906224393285811, "learning_rate": 3.632919434718911e-07, "loss": 8.916632941691205e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 458.75, "completions/min_length": 419.0, "epoch": 12.588235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7978373169898987, "kl": 0.006753626395948231, "learning_rate": 3.6316850496395855e-07, "loss": 6.764382123947144e-05, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 428.125, "completions/min_length": 370.0, "epoch": 12.589705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01462625339627266, "kl": 0.009813516866415739, "learning_rate": 3.6304507547013864e-07, "loss": 9.849316847976297e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 468.3125, "completions/min_length": 407.0, "epoch": 12.591176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010001500137150288, "kl": 0.007431019097566605, "learning_rate": 3.6292165499856263e-07, "loss": 7.443693175446242e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 507.0625, "completions/min_length": 416.0, "epoch": 12.592647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8146217465400696, "kl": 0.010234865709207952, "learning_rate": 3.6279824355736105e-07, "loss": 0.00010241441486869007, "reward": 0.9520833492279053, "reward_std": 0.090495765209198, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 8563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 457.375, "completions/min_length": 388.0, "epoch": 12.594117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8597237467765808, "kl": 0.011573604308068752, "learning_rate": 3.6267484115466397e-07, "loss": 0.00011458247900009155, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 474.875, "completions/min_length": 421.0, "epoch": 12.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012506386265158653, "kl": 0.007830482558347285, "learning_rate": 3.625514477986008e-07, "loss": 7.818800077075139e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 450.4375, "completions/min_length": 379.0, "epoch": 12.597058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009339109994471073, "kl": 0.006790034822188318, "learning_rate": 3.6242806349730036e-07, "loss": 6.788251630496234e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 462.875, "completions/min_length": 378.0, "epoch": 12.598529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0115118026733398, "kl": 0.009722030139528215, "learning_rate": 3.6230468825889103e-07, "loss": 9.888301428873092e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 421.0625, "completions/min_length": 343.0, "epoch": 12.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.1933667659759521, "kl": 0.010186862200498581, "learning_rate": 3.621813220915004e-07, "loss": 0.00010175575880566612, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 440.625, "completions/min_length": 406.0, "epoch": 12.601470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8626539707183838, "kl": 0.012982938787899911, "learning_rate": 3.620579650032555e-07, "loss": 0.0001296355330850929, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 488.5, "completions/min_length": 437.0, "epoch": 12.602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.018913527950644493, "kl": 0.008684304193593562, "learning_rate": 3.619346170022827e-07, "loss": 8.665829955134541e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 502.3125, "completions/min_length": 416.0, "epoch": 12.604411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.7114490270614624, "kl": 0.014549633022397757, "learning_rate": 3.61811278096708e-07, "loss": 0.00014426559209823608, "reward": 0.6312500238418579, "reward_std": 0.40867096185684204, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 471.125, "completions/min_length": 396.0, "epoch": 12.605882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8202426433563232, "kl": 0.009288348956033587, "learning_rate": 3.616879482946565e-07, "loss": 9.326577128376812e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 477.3125, "completions/min_length": 416.0, "epoch": 12.60735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0437263250350952, "kl": 0.007892152178101242, "learning_rate": 3.61564627604253e-07, "loss": 7.879085023887455e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 485.25, "completions/min_length": 446.0, "epoch": 12.608823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9512550234794617, "kl": 0.00866275536827743, "learning_rate": 3.6144131603362154e-07, "loss": 8.740757766645402e-05, "reward": 0.6000000238418579, "reward_std": 0.16256865859031677, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.632455587387085, "step": 8574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 472.375, "completions/min_length": 396.0, "epoch": 12.610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.013281905092298985, "kl": 0.008738302742131054, "learning_rate": 3.613180135908854e-07, "loss": 8.803018863545731e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 436.8125, "completions/min_length": 372.0, "epoch": 12.611764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8305985927581787, "kl": 0.008889297139830887, "learning_rate": 3.611947202841677e-07, "loss": 8.825506665743887e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 413.375, "completions/min_length": 376.0, "epoch": 12.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008218061178922653, "kl": 0.006932765361852944, "learning_rate": 3.610714361215905e-07, "loss": 6.973739073146135e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 440.3125, "completions/min_length": 374.0, "epoch": 12.614705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.10671014338731766, "kl": 0.013793263584375381, "learning_rate": 3.6094816111127547e-07, "loss": 0.00013290121569298208, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/mean_length": 495.5625, "completions/min_length": 404.0, "epoch": 12.616176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.04574501886963844, "kl": 0.00882438535336405, "learning_rate": 3.608248952613437e-07, "loss": 8.926775626605377e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 428.375, "completions/min_length": 391.0, "epoch": 12.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.16544847190380096, "kl": 0.014572509215213358, "learning_rate": 3.6070163857991577e-07, "loss": 0.00014506539446301758, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 462.1875, "completions/min_length": 406.0, "epoch": 12.619117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9773102402687073, "kl": 0.008881157729774714, "learning_rate": 3.6057839107511123e-07, "loss": 8.883481496013701e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 460.375, "completions/min_length": 337.0, "epoch": 12.620588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02640368416905403, "kl": 0.00956756726372987, "learning_rate": 3.6045515275504943e-07, "loss": 9.539686288917437e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 448.8125, "completions/min_length": 401.0, "epoch": 12.62205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8928179144859314, "kl": 0.010504352627322078, "learning_rate": 3.60331923627849e-07, "loss": 0.00010405525972601026, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 496.25, "completions/min_length": 409.0, "epoch": 12.623529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.7361050248146057, "kl": 0.006489357096143067, "learning_rate": 3.60208703701628e-07, "loss": 6.495350680779666e-05, "reward": 0.8837500214576721, "reward_std": 0.10845870524644852, "rewards/DrugCombAccuracyCOTORM/mean": 0.8729166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.20699571073055267, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.27131369709968567, "step": 8584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 448.4375, "completions/min_length": 383.0, "epoch": 12.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.018233444541692734, "kl": 0.008699684171006083, "learning_rate": 3.6008549298450396e-07, "loss": 8.689312380738556e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 465.25, "completions/min_length": 426.0, "epoch": 12.626470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4656981229782104, "kl": 0.010778606170788407, "learning_rate": 3.599622914845936e-07, "loss": 0.00010824203491210938, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 475.125, "completions/min_length": 425.0, "epoch": 12.62794117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4233306646347046, "kl": 0.012188653461635113, "learning_rate": 3.5983909921001286e-07, "loss": 0.0001221969723701477, "reward": 0.7928333282470703, "reward_std": 0.38331830501556396, "rewards/DrugCombAccuracyCOTORM/mean": 0.7775000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.4020530879497528, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6763190627098083, "step": 8587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 445.9375, "completions/min_length": 355.0, "epoch": 12.629411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.020717192441225052, "kl": 0.009220479521900415, "learning_rate": 3.5971591616887767e-07, "loss": 9.251225856132805e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 441.6875, "completions/min_length": 386.0, "epoch": 12.630882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0490264892578125, "kl": 0.010591936763375998, "learning_rate": 3.595927423693029e-07, "loss": 0.00010712294897530228, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 440.3125, "completions/min_length": 413.0, "epoch": 12.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008677704259753227, "kl": 0.00729328824672848, "learning_rate": 3.594695778194029e-07, "loss": 7.321788143599406e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 458.625, "completions/min_length": 404.0, "epoch": 12.633823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.021348029375076294, "kl": 0.009226745227351785, "learning_rate": 3.5934642252729155e-07, "loss": 9.262756793759763e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 408.0625, "completions/min_length": 361.0, "epoch": 12.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.024284256622195244, "kl": 0.008926520007662475, "learning_rate": 3.5922327650108196e-07, "loss": 8.942066051531583e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 444.9375, "completions/min_length": 352.0, "epoch": 12.636764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8180159330368042, "kl": 0.007508874638006091, "learning_rate": 3.5910013974888666e-07, "loss": 7.532114250352606e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 481.875, "completions/min_length": 435.0, "epoch": 12.638235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1934819221496582, "kl": 0.013470628531649709, "learning_rate": 3.5897701227881743e-07, "loss": 0.00013399869203567505, "reward": 0.6044166684150696, "reward_std": 0.4236293137073517, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4901813864707947, "step": 8594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 454.0, "completions/min_length": 381.0, "epoch": 12.639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.012835383415222168, "kl": 0.009886445477604866, "learning_rate": 3.588538940989858e-07, "loss": 9.854033123701811e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 491.5625, "completions/min_length": 413.0, "epoch": 12.641176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.2326931953430176, "kl": 0.01289026951417327, "learning_rate": 3.587307852175024e-07, "loss": 0.00012939423322677612, "reward": 0.6706666946411133, "reward_std": 0.2588162422180176, "rewards/DrugCombAccuracyCOTORM/mean": 0.60916668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.39045557379722595, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.22771000862121582, "step": 8596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 489.6875, "completions/min_length": 402.0, "epoch": 12.64264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.991217851638794, "kl": 0.007988461060449481, "learning_rate": 3.5860768564247743e-07, "loss": 8.034386701183394e-05, "reward": 0.9375, "reward_std": 0.13822859525680542, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 476.125, "completions/min_length": 428.0, "epoch": 12.644117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.025073066353797913, "kl": 0.010707187815569341, "learning_rate": 3.5848459538202045e-07, "loss": 0.00010733096132753417, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 468.75, "completions/min_length": 412.0, "epoch": 12.645588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010231180116534233, "kl": 0.007743399008177221, "learning_rate": 3.5836151444424e-07, "loss": 7.756602281006053e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 439.4375, "completions/min_length": 388.0, "epoch": 12.647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02256624400615692, "kl": 0.008631904609501362, "learning_rate": 3.582384428372446e-07, "loss": 8.620467269793153e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 500.6875, "completions/min_length": 434.0, "epoch": 12.648529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011969047598540783, "kl": 0.008378663565963507, "learning_rate": 3.581153805691418e-07, "loss": 8.344520756509155e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 465.5625, "completions/min_length": 415.0, "epoch": 12.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.963975727558136, "kl": 0.013338832184672356, "learning_rate": 3.5799232764803867e-07, "loss": 0.00013410902465693653, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 418.25, "completions/min_length": 375.0, "epoch": 12.651470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010607716627418995, "kl": 0.008178303949534893, "learning_rate": 3.5786928408204166e-07, "loss": 8.119912672555074e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 499.75, "completions/min_length": 439.0, "epoch": 12.652941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8311124444007874, "kl": 0.009557722136378288, "learning_rate": 3.5774624987925655e-07, "loss": 9.515932470094413e-05, "reward": 0.543749988079071, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 8604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 492.0625, "completions/min_length": 423.0, "epoch": 12.654411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.015150278806686401, "kl": 0.009144175914116204, "learning_rate": 3.5762322504778843e-07, "loss": 9.115661669056863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 448.875, "completions/min_length": 379.0, "epoch": 12.655882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.5428550243377686, "kl": 0.014108399511314929, "learning_rate": 3.575002095957419e-07, "loss": 0.00013908743858337402, "reward": 0.768750011920929, "reward_std": 0.4286451041698456, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 8606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 446.8125, "completions/min_length": 306.0, "epoch": 12.657352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.1155204772949219, "kl": 0.01168575615156442, "learning_rate": 3.57377203531221e-07, "loss": 0.00011542811989784241, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 476.4375, "completions/min_length": 372.0, "epoch": 12.658823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8546745181083679, "kl": 0.013048915308900177, "learning_rate": 3.57254206862329e-07, "loss": 0.00012911856174468994, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 463.5, "completions/min_length": 416.0, "epoch": 12.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020510703325271606, "kl": 0.010351158794946969, "learning_rate": 3.571312195971685e-07, "loss": 0.00010381810716353357, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 457.375, "completions/min_length": 400.0, "epoch": 12.661764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5026664733886719, "kl": 0.014972062082961202, "learning_rate": 3.570082417438419e-07, "loss": 0.0001497715711593628, "reward": 0.7714166641235352, "reward_std": 0.41201287508010864, "rewards/DrugCombAccuracyCOTORM/mean": 0.7637500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.42547035217285156, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8001735806465149, "step": 8610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 478.3125, "completions/min_length": 413.0, "epoch": 12.663235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.947955310344696, "kl": 0.011870890855789185, "learning_rate": 3.5688527331045035e-07, "loss": 0.0001180879698949866, "reward": 0.5102500319480896, "reward_std": 0.1661846935749054, "rewards/DrugCombAccuracyCOTORM/mean": 0.3956249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.486493855714798, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 8611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 437.0625, "completions/min_length": 341.0, "epoch": 12.66470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008455113507807255, "kl": 0.007320526870898902, "learning_rate": 3.5676231430509486e-07, "loss": 7.304689643206075e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 491.5625, "completions/min_length": 407.0, "epoch": 12.666176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.017229488119482994, "kl": 0.007541792350821197, "learning_rate": 3.566393647358755e-07, "loss": 7.509622082579881e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 541.4375, "completions/min_length": 418.0, "epoch": 12.66764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9097836017608643, "kl": 0.012375782243907452, "learning_rate": 3.5651642461089204e-07, "loss": 0.0001257527619600296, "reward": 0.652999997138977, "reward_std": 0.03111269511282444, "rewards/DrugCombAccuracyCOTORM/mean": 0.5870833396911621, "rewards/DrugCombAccuracyCOTORM/std": 0.42975595593452454, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 433.1875, "completions/min_length": 389.0, "epoch": 12.669117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01136423647403717, "kl": 0.008337905397638679, "learning_rate": 3.5639349393824346e-07, "loss": 8.310294651892036e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.375, "completions/min_length": 382.0, "epoch": 12.670588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008804084733128548, "kl": 0.008357372600585222, "learning_rate": 3.562705727260281e-07, "loss": 8.311806595884264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 378.5625, "completions/min_length": 339.0, "epoch": 12.672058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.00929722748696804, "kl": 0.008080308442004025, "learning_rate": 3.5614766098234366e-07, "loss": 8.0828060163185e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 436.3125, "completions/min_length": 398.0, "epoch": 12.673529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012578221037983894, "kl": 0.006436234922148287, "learning_rate": 3.560247587152873e-07, "loss": 6.40057769487612e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 433.5625, "completions/min_length": 391.0, "epoch": 12.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.010306010022759438, "kl": 0.010301323374733329, "learning_rate": 3.5590186593295533e-07, "loss": 0.00010288141493219882, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 439.5625, "completions/min_length": 357.0, "epoch": 12.676470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.013011702336370945, "kl": 0.00772606732789427, "learning_rate": 3.5577898264344385e-07, "loss": 7.71472550695762e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 477.125, "completions/min_length": 418.0, "epoch": 12.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012524638324975967, "kl": 0.007657656795345247, "learning_rate": 3.55656108854848e-07, "loss": 7.62943527661264e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 465.75, "completions/min_length": 436.0, "epoch": 12.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8921229839324951, "kl": 0.008742651785723865, "learning_rate": 3.5553324457526255e-07, "loss": 8.693337440490723e-05, "reward": 0.9775428771972656, "reward_std": 0.06351838260889053, "rewards/DrugCombAccuracyCOTORM/mean": 0.971928596496582, "rewards/DrugCombAccuracyCOTORM/std": 0.11228571832180023, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 504.0, "completions/min_length": 426.0, "epoch": 12.680882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0892608165740967, "kl": 0.01492623402737081, "learning_rate": 3.554103898127812e-07, "loss": 0.00014768540859222412, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 420.75, "completions/min_length": 362.0, "epoch": 12.68235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9191942811012268, "kl": 0.0077205735724419355, "learning_rate": 3.5528754457549746e-07, "loss": 7.73407518863678e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 447.625, "completions/min_length": 317.0, "epoch": 12.683823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0153814554214478, "kl": 0.013052021036855876, "learning_rate": 3.551647088715041e-07, "loss": 0.00013010254770051688, "reward": 0.824999988079071, "reward_std": 0.24348658323287964, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 434.25, "completions/min_length": 389.0, "epoch": 12.685294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0225378405302763, "kl": 0.011229493655264378, "learning_rate": 3.5504188270889313e-07, "loss": 0.00011280018952675164, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 444.4375, "completions/min_length": 387.0, "epoch": 12.686764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8639218807220459, "kl": 0.009391813771799207, "learning_rate": 3.549190660957562e-07, "loss": 9.398162364959717e-05, "reward": 0.6464166641235352, "reward_std": 0.1461501270532608, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 8627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 475.4375, "completions/min_length": 416.0, "epoch": 12.688235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.731685996055603, "kl": 0.008833463885821402, "learning_rate": 3.54796259040184e-07, "loss": 8.810678264126182e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 462.0625, "completions/min_length": 354.0, "epoch": 12.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.088344693183899, "kl": 0.008867682656273246, "learning_rate": 3.546734615502668e-07, "loss": 8.966028690338135e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 466.6875, "completions/min_length": 403.0, "epoch": 12.691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7610820531845093, "kl": 0.010198731324635446, "learning_rate": 3.5455067363409407e-07, "loss": 0.00010246210149489343, "reward": 0.5062500238418579, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.9979145526885986, "step": 8630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 445.5625, "completions/min_length": 385.0, "epoch": 12.69264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9407737255096436, "kl": 0.009211611817590892, "learning_rate": 3.544278952997549e-07, "loss": 9.277788922190666e-05, "reward": 0.65625, "reward_std": 0.21286731958389282, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.6291528940200806, "step": 8631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 446.625, "completions/min_length": 407.0, "epoch": 12.694117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8532906770706177, "kl": 0.012416428420692682, "learning_rate": 3.5430512655533767e-07, "loss": 0.00012404739391058683, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 395.1875, "completions/min_length": 326.0, "epoch": 12.695588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012572068721055984, "kl": 0.0075728787342086434, "learning_rate": 3.5418236740892995e-07, "loss": 7.552095485152677e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 467.6875, "completions/min_length": 362.0, "epoch": 12.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9506686925888062, "kl": 0.01192223560065031, "learning_rate": 3.540596178686189e-07, "loss": 0.00011869846639456227, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/mean_length": 548.0625, "completions/min_length": 400.0, "epoch": 12.698529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8881452679634094, "kl": 0.007412772160023451, "learning_rate": 3.5393687794249087e-07, "loss": 7.335469126701355e-05, "reward": 0.8636393547058105, "reward_std": 0.17464342713356018, "rewards/DrugCombAccuracyCOTORM/mean": 0.8371012806892395, "rewards/DrugCombAccuracyCOTORM/std": 0.3295484185218811, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9395833015441895, "rewards/DrugCombCoverageCOTORM/std": 0.1569235622882843, "step": 8635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 407.0, "completions/min_length": 338.0, "epoch": 12.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.023804830387234688, "kl": 0.008152202935889363, "learning_rate": 3.5381414763863163e-07, "loss": 8.200186857720837e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 467.25, "completions/min_length": 423.0, "epoch": 12.701470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5537257194519043, "kl": 0.011682689189910889, "learning_rate": 3.5369142696512646e-07, "loss": 0.00011690706014633179, "reward": 0.612250030040741, "reward_std": 0.36439549922943115, "rewards/DrugCombAccuracyCOTORM/mean": 0.5309374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.48794543743133545, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 8637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 452.5625, "completions/min_length": 366.0, "epoch": 12.702941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.05640368163585663, "kl": 0.010058677988126874, "learning_rate": 3.5356871593005976e-07, "loss": 9.819668775890023e-05, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 426.0625, "completions/min_length": 326.0, "epoch": 12.704411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015519404783844948, "kl": 0.009221497341059148, "learning_rate": 3.534460145415155e-07, "loss": 9.188729745801538e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 460.625, "completions/min_length": 376.0, "epoch": 12.705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9721735715866089, "kl": 0.012048295699059963, "learning_rate": 3.5332332280757706e-07, "loss": 0.00011957436800003052, "reward": 0.4937500059604645, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 429.0625, "completions/min_length": 356.0, "epoch": 12.70735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7479197382926941, "kl": 0.009481600718572736, "learning_rate": 3.532006407363267e-07, "loss": 9.474557737121359e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 435.3125, "completions/min_length": 389.0, "epoch": 12.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.1885478496551514, "kl": 0.011371295433491468, "learning_rate": 3.5307796833584673e-07, "loss": 0.00011406283010728657, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 451.9375, "completions/min_length": 388.0, "epoch": 12.71029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9821764230728149, "kl": 0.011606213403865695, "learning_rate": 3.529553056142184e-07, "loss": 0.00011504441499710083, "reward": 0.8439583778381348, "reward_std": 0.02180246263742447, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.18726837635040283, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.856249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.15041610598564148, "step": 8643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 444.4375, "completions/min_length": 409.0, "epoch": 12.711764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.011157931759953499, "kl": 0.007653865497559309, "learning_rate": 3.528326525795223e-07, "loss": 7.636045484105125e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 490.25, "completions/min_length": 396.0, "epoch": 12.713235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8610860109329224, "kl": 0.008935397258028388, "learning_rate": 3.527100092398387e-07, "loss": 9.071826934814453e-05, "reward": 0.8458333611488342, "reward_std": 0.21962270140647888, "rewards/DrugCombAccuracyCOTORM/mean": 0.8385416865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.34219980239868164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 522.0, "completions/min_length": 442.0, "epoch": 12.714705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9286288619041443, "kl": 0.009325579041615129, "learning_rate": 3.525873756032471e-07, "loss": 9.316539217252284e-05, "reward": 0.7105500102043152, "reward_std": 0.1610409915447235, "rewards/DrugCombAccuracyCOTORM/mean": 0.6772500276565552, "rewards/DrugCombAccuracyCOTORM/std": 0.39889824390411377, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.6718547940254211, "step": 8646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 473.5, "completions/min_length": 347.0, "epoch": 12.716176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0438252687454224, "kl": 0.010198687668889761, "learning_rate": 3.52464751677826e-07, "loss": 0.00010229647159576416, "reward": 0.5922999978065491, "reward_std": 0.07028113305568695, "rewards/DrugCombAccuracyCOTORM/mean": 0.5476666688919067, "rewards/DrugCombAccuracyCOTORM/std": 0.47260037064552307, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7781745791435242, "step": 8647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 480.375, "completions/min_length": 381.0, "epoch": 12.717647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1428513526916504, "kl": 0.006954236654564738, "learning_rate": 3.5234213747165363e-07, "loss": 7.022172212600708e-05, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 510.4375, "completions/min_length": 442.0, "epoch": 12.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8973928093910217, "kl": 0.007752444944344461, "learning_rate": 3.522195329928076e-07, "loss": 7.659133552806452e-05, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 420.1875, "completions/min_length": 364.0, "epoch": 12.720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012699722312390804, "kl": 0.005761948530562222, "learning_rate": 3.520969382493648e-07, "loss": 5.846068961545825e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 504.1875, "completions/min_length": 425.0, "epoch": 12.722058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8373205065727234, "kl": 0.01089231506921351, "learning_rate": 3.519743532494014e-07, "loss": 0.00010734051465988159, "reward": 0.7250000238418579, "reward_std": 0.18322508037090302, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.4732423722743988, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 466.875, "completions/min_length": 414.0, "epoch": 12.723529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.019930504262447357, "kl": 0.0099870819831267, "learning_rate": 3.518517780009931e-07, "loss": 0.00010025636584032327, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 433.75, "completions/min_length": 373.0, "epoch": 12.725, "frac_reward_zero_std": 0.5, "grad_norm": 0.9509378671646118, "kl": 0.012015907792374492, "learning_rate": 3.517292125122145e-07, "loss": 0.0001199189864564687, "reward": 0.949999988079071, "reward_std": 0.0690065398812294, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.13437095284461975, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 454.8125, "completions/min_length": 396.0, "epoch": 12.726470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01603107340633869, "kl": 0.01189441978931427, "learning_rate": 3.516066567911403e-07, "loss": 0.00011827389243990183, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 438.9375, "completions/min_length": 374.0, "epoch": 12.727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1130220890045166, "kl": 0.010357393883168697, "learning_rate": 3.514841108458441e-07, "loss": 0.00010391324758529663, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 461.9375, "completions/min_length": 408.0, "epoch": 12.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008949077688157558, "kl": 0.006449862150475383, "learning_rate": 3.5136157468439863e-07, "loss": 6.405139720300213e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 517.125, "completions/min_length": 474.0, "epoch": 12.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 2.074260950088501, "kl": 0.014497749623842537, "learning_rate": 3.512390483148766e-07, "loss": 0.0001482511288486421, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 452.1875, "completions/min_length": 374.0, "epoch": 12.73235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.00015389919281, "kl": 0.009714256273582578, "learning_rate": 3.5111653174534966e-07, "loss": 9.700350346975029e-05, "reward": 0.8885208368301392, "reward_std": 0.07030406594276428, "rewards/DrugCombAccuracyCOTORM/mean": 0.8671614527702332, "rewards/DrugCombAccuracyCOTORM/std": 0.17956797778606415, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9479166269302368, "rewards/DrugCombCoverageCOTORM/std": 0.07978560030460358, "step": 8658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 443.4375, "completions/min_length": 392.0, "epoch": 12.733823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8620213866233826, "kl": 0.0098230802686885, "learning_rate": 3.5099402498388873e-07, "loss": 9.723007678985596e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 447.125, "completions/min_length": 406.0, "epoch": 12.735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010422022081911564, "kl": 0.006937582395039499, "learning_rate": 3.5087152803856435e-07, "loss": 6.932139513082802e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 467.5, "completions/min_length": 401.0, "epoch": 12.736764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9878421425819397, "kl": 0.018242684076540172, "learning_rate": 3.5074904091744627e-07, "loss": 0.0001839939213823527, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 462.375, "completions/min_length": 381.0, "epoch": 12.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9210236668586731, "kl": 0.010412523988634348, "learning_rate": 3.5062656362860373e-07, "loss": 0.00010439414472784847, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 472.9375, "completions/min_length": 413.0, "epoch": 12.739705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1005767583847046, "kl": 0.015137535985559225, "learning_rate": 3.5050409618010515e-07, "loss": 0.00015227682888507843, "reward": 0.6849583387374878, "reward_std": 0.11712367087602615, "rewards/DrugCombAccuracyCOTORM/mean": 0.628333330154419, "rewards/DrugCombAccuracyCOTORM/std": 0.4374505281448364, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.23935678601264954, "step": 8663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 419.9375, "completions/min_length": 353.0, "epoch": 12.741176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.039102282375097275, "kl": 0.009382387739606202, "learning_rate": 3.5038163858001847e-07, "loss": 9.247846901416779e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 471.875, "completions/min_length": 373.0, "epoch": 12.742647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.009804817847907543, "kl": 0.00877516926266253, "learning_rate": 3.5025919083641067e-07, "loss": 8.659374725539237e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 440.0625, "completions/min_length": 389.0, "epoch": 12.744117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.022580094635486603, "kl": 0.009493582765571773, "learning_rate": 3.5013675295734846e-07, "loss": 9.491012315265834e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 416.625, "completions/min_length": 368.0, "epoch": 12.745588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.016766943037509918, "kl": 0.011540996260009706, "learning_rate": 3.5001432495089767e-07, "loss": 0.00011516416270751506, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 498.125, "completions/min_length": 442.0, "epoch": 12.74705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010740326717495918, "kl": 0.008114474709145725, "learning_rate": 3.4989190682512364e-07, "loss": 8.111730858217925e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 450.3125, "completions/min_length": 406.0, "epoch": 12.748529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.018411625176668167, "kl": 0.008963547530584037, "learning_rate": 3.497694985880909e-07, "loss": 8.926158625399694e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 8669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 475.0625, "completions/min_length": 431.0, "epoch": 12.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.015101052820682526, "kl": 0.008652560762129724, "learning_rate": 3.4964710024786347e-07, "loss": 8.613381942268461e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 460.25, "completions/min_length": 420.0, "epoch": 12.751470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.01005755178630352, "kl": 0.006602472625672817, "learning_rate": 3.4952471181250456e-07, "loss": 6.627332186326385e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 481.5, "completions/min_length": 411.0, "epoch": 12.75294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0334280729293823, "kl": 0.009864605264738202, "learning_rate": 3.494023332900768e-07, "loss": 9.991669503506273e-05, "reward": 0.6656249761581421, "reward_std": 0.11175347864627838, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4552929699420929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.20155644416809082, "step": 8672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 488.6875, "completions/min_length": 433.0, "epoch": 12.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9043217897415161, "kl": 0.008453986956737936, "learning_rate": 3.4927996468864217e-07, "loss": 8.498595707351342e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 488.375, "completions/min_length": 450.0, "epoch": 12.755882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.021665794774889946, "kl": 0.01315947133116424, "learning_rate": 3.491576060162621e-07, "loss": 0.0001315883273491636, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 499.0, "completions/min_length": 423.0, "epoch": 12.757352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8104186058044434, "kl": 0.007820862927474082, "learning_rate": 3.4903525728099715e-07, "loss": 7.859617471694946e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 535.4375, "completions/min_length": 491.0, "epoch": 12.758823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.4789162874221802, "kl": 0.021362459752708673, "learning_rate": 3.4891291849090754e-07, "loss": 0.00021379068493843079, "reward": 0.663917601108551, "reward_std": 0.12792758643627167, "rewards/DrugCombAccuracyCOTORM/mean": 0.6235167384147644, "rewards/DrugCombAccuracyCOTORM/std": 0.26591843366622925, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6510416269302368, "rewards/DrugCombCoverageCOTORM/std": 0.4586804509162903, "step": 8676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 398.4375, "completions/min_length": 336.0, "epoch": 12.760294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.030539710074663162, "kl": 0.009674866450950503, "learning_rate": 3.487905896540524e-07, "loss": 9.661330841481686e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 475.0625, "completions/min_length": 417.0, "epoch": 12.761764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.52967369556427, "kl": 0.011894137831404805, "learning_rate": 3.4866827077849055e-07, "loss": 0.00011850893497467041, "reward": 0.8177083730697632, "reward_std": 0.2823638916015625, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4013864994049072, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 8678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 469.375, "completions/min_length": 390.0, "epoch": 12.763235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0531824827194214, "kl": 0.009333326946943998, "learning_rate": 3.4854596187228003e-07, "loss": 9.360401600133628e-05, "reward": 0.6371666789054871, "reward_std": 0.16668714582920074, "rewards/DrugCombAccuracyCOTORM/mean": 0.6037499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4699627757072449, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7781745791435242, "step": 8679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.1875, "completions/min_length": 377.0, "epoch": 12.764705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.1403212547302246, "kl": 0.00921723386272788, "learning_rate": 3.484236629434782e-07, "loss": 9.228289127349854e-05, "reward": 0.7630000114440918, "reward_std": 0.21438458561897278, "rewards/DrugCombAccuracyCOTORM/mean": 0.7141666412353516, "rewards/DrugCombAccuracyCOTORM/std": 0.3222731947898865, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 8680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 451.125, "completions/min_length": 390.0, "epoch": 12.766176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9902714490890503, "kl": 0.010168394888751209, "learning_rate": 3.4830137400014206e-07, "loss": 0.00010144710540771484, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 474.6875, "completions/min_length": 409.0, "epoch": 12.76764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0016826391220093, "kl": 0.006227288627997041, "learning_rate": 3.481790950503274e-07, "loss": 6.233685417100787e-05, "reward": 0.8806500434875488, "reward_std": 0.16521817445755005, "rewards/DrugCombAccuracyCOTORM/mean": 0.857285737991333, "rewards/DrugCombAccuracyCOTORM/std": 0.30723828077316284, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9482142925262451, "rewards/DrugCombCoverageCOTORM/std": 0.12131647765636444, "step": 8682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 440.0, "completions/min_length": 369.0, "epoch": 12.769117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.008298548869788647, "kl": 0.006776198511943221, "learning_rate": 3.4805682610208974e-07, "loss": 6.798025424359366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 445.125, "completions/min_length": 393.0, "epoch": 12.770588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7959861159324646, "kl": 0.010962170781567693, "learning_rate": 3.4793456716348383e-07, "loss": 0.00010947883129119873, "reward": 0.882437527179718, "reward_std": 0.16225165128707886, "rewards/DrugCombAccuracyCOTORM/mean": 0.8589062690734863, "rewards/DrugCombAccuracyCOTORM/std": 0.30334246158599854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 8684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 462.625, "completions/min_length": 383.0, "epoch": 12.772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9758458733558655, "kl": 0.008687167894095182, "learning_rate": 3.4781231824256385e-07, "loss": 8.742943464312702e-05, "reward": 0.25, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 457.75, "completions/min_length": 396.0, "epoch": 12.773529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011023170314729214, "kl": 0.010633980855345726, "learning_rate": 3.476900793473831e-07, "loss": 0.00010613071935949847, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 493.8125, "completions/min_length": 424.0, "epoch": 12.775, "frac_reward_zero_std": 0.5, "grad_norm": 1.2115142345428467, "kl": 0.013738543726503849, "learning_rate": 3.475678504859946e-07, "loss": 0.00013802945613861084, "reward": 0.9056999683380127, "reward_std": 0.17460967600345612, "rewards/DrugCombAccuracyCOTORM/mean": 0.8914999961853027, "rewards/DrugCombAccuracyCOTORM/std": 0.2964784502983093, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.20493900775909424, "step": 8687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 465.625, "completions/min_length": 408.0, "epoch": 12.776470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011042208410799503, "kl": 0.008222622331231833, "learning_rate": 3.474456316664504e-07, "loss": 8.197054557967931e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 436.5, "completions/min_length": 378.0, "epoch": 12.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645367406308651, "kl": 0.009498691535554826, "learning_rate": 3.4732342289680184e-07, "loss": 9.460172441322356e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 506.375, "completions/min_length": 406.0, "epoch": 12.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.2376776933670044, "kl": 0.013365271734073758, "learning_rate": 3.4720122418509976e-07, "loss": 0.00013275444507598877, "reward": 0.6879850029945374, "reward_std": 0.13387134671211243, "rewards/DrugCombAccuracyCOTORM/mean": 0.6279500126838684, "rewards/DrugCombAccuracyCOTORM/std": 0.4413161873817444, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.856249988079071, "rewards/DrugCombCoverageCOTORM/std": 0.27317577600479126, "step": 8690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 433.625, "completions/min_length": 352.0, "epoch": 12.780882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011638889089226723, "kl": 0.008973891730420291, "learning_rate": 3.4707903553939434e-07, "loss": 8.887183503247797e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 444.875, "completions/min_length": 377.0, "epoch": 12.782352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.3949404954910278, "kl": 0.009643672732636333, "learning_rate": 3.4695685696773515e-07, "loss": 9.666250844020396e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 449.5625, "completions/min_length": 377.0, "epoch": 12.783823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01917348802089691, "kl": 0.009573457762598991, "learning_rate": 3.468346884781708e-07, "loss": 9.495223639532924e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 470.8125, "completions/min_length": 420.0, "epoch": 12.785294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.026287149637937546, "kl": 0.011900050565600395, "learning_rate": 3.4671253007874974e-07, "loss": 0.00011810632713604718, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 500.8125, "completions/min_length": 437.0, "epoch": 12.786764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234419107437134, "kl": 0.01093067298643291, "learning_rate": 3.4659038177751916e-07, "loss": 0.00010854750871658325, "reward": 0.4389333426952362, "reward_std": 0.39324241876602173, "rewards/DrugCombAccuracyCOTORM/mean": 0.38720834255218506, "rewards/DrugCombAccuracyCOTORM/std": 0.43256035447120667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.2916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.7993053197860718, "step": 8695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 461.6875, "completions/min_length": 407.0, "epoch": 12.788235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.4637542963027954, "kl": 0.011476161773316562, "learning_rate": 3.46468243582526e-07, "loss": 0.00011152401566505432, "reward": 0.7083333730697632, "reward_std": 0.3019161820411682, "rewards/DrugCombAccuracyCOTORM/mean": 0.6354166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3859512209892273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 461.8125, "completions/min_length": 408.0, "epoch": 12.78970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010920497588813305, "kl": 0.007291028741747141, "learning_rate": 3.463461155018164e-07, "loss": 7.276973337866366e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 486.125, "completions/min_length": 378.0, "epoch": 12.791176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.914421558380127, "kl": 0.009771506884135306, "learning_rate": 3.4622399754343583e-07, "loss": 9.767006849870086e-05, "reward": 0.625, "reward_std": 0.15811389684677124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 496.5, "completions/min_length": 433.0, "epoch": 12.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0408231019973755, "kl": 0.008464585524052382, "learning_rate": 3.461018897154292e-07, "loss": 8.479904499836266e-05, "reward": 0.9928571581840515, "reward_std": 0.02020304463803768, "rewards/DrugCombAccuracyCOTORM/mean": 0.9910714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.0357142835855484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 409.25, "completions/min_length": 339.0, "epoch": 12.794117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.0479453802108765, "kl": 0.008412242867052555, "learning_rate": 3.459797920258407e-07, "loss": 8.387118577957153e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 453.8125, "completions/min_length": 413.0, "epoch": 12.795588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012906770221889019, "kl": 0.007365053053945303, "learning_rate": 3.458577044827136e-07, "loss": 7.351691601797938e-05, "reward": 0.6865000128746033, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6237499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.38858935236930847, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.12909944355487823, "step": 8701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 466.125, "completions/min_length": 393.0, "epoch": 12.797058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.01114070788025856, "kl": 0.00602924486156553, "learning_rate": 3.457356270940909e-07, "loss": 6.0080685216235e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 478.375, "completions/min_length": 381.0, "epoch": 12.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010334893129765987, "kl": 0.01351305318530649, "learning_rate": 3.4561355986801456e-07, "loss": 0.00013252635835669935, "reward": 0.7016666531562805, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.637499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3743883967399597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 8703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 396.8125, "completions/min_length": 357.0, "epoch": 12.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.1057361364364624, "kl": 0.01355675864033401, "learning_rate": 3.454915028125263e-07, "loss": 0.00013585388660430908, "reward": 0.44999998807907104, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 399.9375, "completions/min_length": 300.0, "epoch": 12.801470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.015815503895282745, "kl": 0.00861425290349871, "learning_rate": 3.453694559356668e-07, "loss": 8.59084611875005e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 457.25, "completions/min_length": 404.0, "epoch": 12.802941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014410702511668205, "kl": 0.005903837038204074, "learning_rate": 3.4524741924547635e-07, "loss": 5.937717651249841e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 479.125, "completions/min_length": 413.0, "epoch": 12.804411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8376122713088989, "kl": 0.013211075216531754, "learning_rate": 3.4512539274999413e-07, "loss": 0.00013262033462524414, "reward": 0.9588750004768372, "reward_std": 0.056757885962724686, "rewards/DrugCombAccuracyCOTORM/mean": 0.9524999856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.10212194174528122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 8707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 514.375, "completions/min_length": 449.0, "epoch": 12.805882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0197639465332031, "kl": 0.012782285688444972, "learning_rate": 3.450033764572592e-07, "loss": 0.00012941229215357453, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 426.8125, "completions/min_length": 390.0, "epoch": 12.80735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008703788742423058, "kl": 0.008384885732084513, "learning_rate": 3.4488137037530944e-07, "loss": 8.258878369815648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 464.25, "completions/min_length": 360.0, "epoch": 12.808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8708295822143555, "kl": 0.009334954316727817, "learning_rate": 3.4475937451218254e-07, "loss": 9.308010339736938e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 494.9375, "completions/min_length": 405.0, "epoch": 12.810294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.5080849528312683, "kl": 0.009315918199717999, "learning_rate": 3.446373888759151e-07, "loss": 9.381771087646484e-05, "reward": 0.8861863017082214, "reward_std": 0.13176089525222778, "rewards/DrugCombAccuracyCOTORM/mean": 0.8671078681945801, "rewards/DrugCombAccuracyCOTORM/std": 0.2626224458217621, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.925000011920929, "rewards/DrugCombCoverageCOTORM/std": 0.13416409492492676, "step": 8711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 431.875, "completions/min_length": 362.0, "epoch": 12.811764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.02742878720164299, "kl": 0.011908384738489985, "learning_rate": 3.445154134745435e-07, "loss": 0.00011940514377783984, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 436.5625, "completions/min_length": 382.0, "epoch": 12.813235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.017749927937984467, "kl": 0.007057967595756054, "learning_rate": 3.4439344831610284e-07, "loss": 7.048808765830472e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 457.625, "completions/min_length": 404.0, "epoch": 12.814705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8479204177856445, "kl": 0.013786904513835907, "learning_rate": 3.4427149340862803e-07, "loss": 0.000138014554977417, "reward": 0.7507095336914062, "reward_std": 0.1761484295129776, "rewards/DrugCombAccuracyCOTORM/mean": 0.7144285440444946, "rewards/DrugCombAccuracyCOTORM/std": 0.39416685700416565, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 436.75, "completions/min_length": 391.0, "epoch": 12.816176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3482093811035156, "kl": 0.012797398027032614, "learning_rate": 3.441495487601531e-07, "loss": 0.00012755393981933594, "reward": 0.8374999761581421, "reward_std": 0.3619407117366791, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 475.5, "completions/min_length": 419.0, "epoch": 12.81764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3379043340682983, "kl": 0.014101059874519706, "learning_rate": 3.440276143787115e-07, "loss": 0.0001414567232131958, "reward": 0.5681291818618774, "reward_std": 0.3397465944290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.4793020784854889, "rewards/DrugCombAccuracyCOTORM/std": 0.47606271505355835, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.846875011920929, "rewards/DrugCombCoverageCOTORM/std": 0.49848729372024536, "step": 8716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 511.5625, "completions/min_length": 411.0, "epoch": 12.819117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8552606701850891, "kl": 0.008866691845469177, "learning_rate": 3.4390569027233596e-07, "loss": 8.81509404280223e-05, "reward": 0.960812509059906, "reward_std": 0.11083899438381195, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 8717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 416.125, "completions/min_length": 353.0, "epoch": 12.820588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.1703476905822754, "kl": 0.010176615323871374, "learning_rate": 3.4378377644905866e-07, "loss": 0.0001021176140056923, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 414.875, "completions/min_length": 343.0, "epoch": 12.822058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014535768888890743, "kl": 0.007505204062908888, "learning_rate": 3.436618729169107e-07, "loss": 7.497033220715821e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 465.25, "completions/min_length": 397.0, "epoch": 12.823529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.011662897653877735, "kl": 0.007639434654265642, "learning_rate": 3.435399796839229e-07, "loss": 7.62010968173854e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 448.75, "completions/min_length": 413.0, "epoch": 12.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.01649836264550686, "kl": 0.005910815380048007, "learning_rate": 3.4341809675812526e-07, "loss": 5.924329161643982e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 437.3125, "completions/min_length": 376.0, "epoch": 12.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.008641078136861324, "kl": 0.007301451172679663, "learning_rate": 3.4329622414754723e-07, "loss": 7.323981844820082e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 424.1875, "completions/min_length": 374.0, "epoch": 12.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.007778223138302565, "kl": 0.005903503391891718, "learning_rate": 3.431743618602173e-07, "loss": 5.8634883316699415e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 423.5, "completions/min_length": 387.0, "epoch": 12.829411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.02463814988732338, "kl": 0.008517792099155486, "learning_rate": 3.4305250990416367e-07, "loss": 8.408346911892295e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 473.9375, "completions/min_length": 405.0, "epoch": 12.830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01185426115989685, "kl": 0.008211134234443307, "learning_rate": 3.4293066828741337e-07, "loss": 8.201396121876314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 412.5625, "completions/min_length": 357.0, "epoch": 12.83235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012133048847317696, "kl": 0.00836072931997478, "learning_rate": 3.428088370179932e-07, "loss": 8.29763594083488e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 435.0625, "completions/min_length": 363.0, "epoch": 12.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013027946464717388, "kl": 0.008115537697449327, "learning_rate": 3.42687016103929e-07, "loss": 8.050176984397694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 491.625, "completions/min_length": 452.0, "epoch": 12.83529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011968705803155899, "kl": 0.008098767139017582, "learning_rate": 3.425652055532461e-07, "loss": 8.127035107463598e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 422.0, "completions/min_length": 385.0, "epoch": 12.836764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.008451750501990318, "kl": 0.006691931397654116, "learning_rate": 3.424434053739691e-07, "loss": 6.687322456855327e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 444.375, "completions/min_length": 393.0, "epoch": 12.838235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.104214072227478, "kl": 0.00885215902235359, "learning_rate": 3.423216155741218e-07, "loss": 8.939472900237888e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 488.75, "completions/min_length": 339.0, "epoch": 12.839705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0607250928878784, "kl": 0.012109609553590417, "learning_rate": 3.4219983616172736e-07, "loss": 0.00011926889419555664, "reward": 0.8552083373069763, "reward_std": 0.09051631391048431, "rewards/DrugCombAccuracyCOTORM/mean": 0.8229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.23935678601264954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 428.875, "completions/min_length": 319.0, "epoch": 12.841176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.763420581817627, "kl": 0.008199902134947479, "learning_rate": 3.420780671448084e-07, "loss": 8.195638656616211e-05, "reward": 0.5958333015441895, "reward_std": 0.0117851123213768, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 8732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 449.0, "completions/min_length": 374.0, "epoch": 12.842647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.04349995777010918, "kl": 0.010978683130815625, "learning_rate": 3.4195630853138673e-07, "loss": 0.0001110329685616307, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 403.5625, "completions/min_length": 328.0, "epoch": 12.844117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.018923865631222725, "kl": 0.011848499532788992, "learning_rate": 3.4183456032948354e-07, "loss": 0.0001174889548565261, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 451.1875, "completions/min_length": 381.0, "epoch": 12.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7584856152534485, "kl": 0.009846876375377178, "learning_rate": 3.417128225471193e-07, "loss": 9.957258589565754e-05, "reward": 0.949999988079071, "reward_std": 0.10690449178218842, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 390.375, "completions/min_length": 345.0, "epoch": 12.847058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04087723419070244, "kl": 0.012228799983859062, "learning_rate": 3.415910951923137e-07, "loss": 0.00012336252257227898, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 436.25, "completions/min_length": 397.0, "epoch": 12.848529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01964368298649788, "kl": 0.011069504427723587, "learning_rate": 3.4146937827308577e-07, "loss": 0.00010989278962370008, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 446.0, "completions/min_length": 398.0, "epoch": 12.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.02525983192026615, "kl": 0.008692344417795539, "learning_rate": 3.41347671797454e-07, "loss": 8.735392475500703e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 482.75, "completions/min_length": 439.0, "epoch": 12.851470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.7886273860931396, "kl": 0.009059487492777407, "learning_rate": 3.4122597577343616e-07, "loss": 8.946657180786133e-05, "reward": 0.7534999847412109, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.7074999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.39000001549720764, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 8739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 488.375, "completions/min_length": 376.0, "epoch": 12.852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.048044204711914, "kl": 0.011724486015737057, "learning_rate": 3.4110429020904916e-07, "loss": 0.00011774944141507149, "reward": 0.6696428656578064, "reward_std": 0.19112031161785126, "rewards/DrugCombAccuracyCOTORM/mean": 0.6339285373687744, "rewards/DrugCombAccuracyCOTORM/std": 0.4635525643825531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 438.3125, "completions/min_length": 372.0, "epoch": 12.854411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.5872362852096558, "kl": 0.010562821757048368, "learning_rate": 3.409826151123096e-07, "loss": 0.00010497123003005981, "reward": 0.7749999761581421, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 494.5, "completions/min_length": 408.0, "epoch": 12.855882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8968614935874939, "kl": 0.011761972680687904, "learning_rate": 3.408609504912329e-07, "loss": 0.00011584814637899399, "reward": 0.9052083492279053, "reward_std": 0.10225021839141846, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 496.4375, "completions/min_length": 381.0, "epoch": 12.85735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3554044961929321, "kl": 0.014042943716049194, "learning_rate": 3.4073929635383394e-07, "loss": 0.00014287978410720825, "reward": 0.684532642364502, "reward_std": 0.3768308162689209, "rewards/DrugCombAccuracyCOTORM/mean": 0.6447282433509827, "rewards/DrugCombAccuracyCOTORM/std": 0.4760270118713379, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4425306022167206, "step": 8743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 403.625, "completions/min_length": 341.0, "epoch": 12.858823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012629720382392406, "kl": 0.008480032440274954, "learning_rate": 3.4061765270812706e-07, "loss": 8.387923298869282e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 462.4375, "completions/min_length": 391.0, "epoch": 12.860294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.015277266502380371, "kl": 0.008095488185063004, "learning_rate": 3.404960195621259e-07, "loss": 8.113087096717209e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 488.0625, "completions/min_length": 454.0, "epoch": 12.861764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7982423305511475, "kl": 0.011113690095953643, "learning_rate": 3.403743969238434e-07, "loss": 0.00011055970389861614, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 410.375, "completions/min_length": 369.0, "epoch": 12.863235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7715876698493958, "kl": 0.0070316685596480966, "learning_rate": 3.402527848012917e-07, "loss": 7.04999765730463e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 518.125, "completions/min_length": 389.0, "epoch": 12.864705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.2685233354568481, "kl": 0.010657078819349408, "learning_rate": 3.4013118320248213e-07, "loss": 0.00010574609041213989, "reward": 0.9052083492279053, "reward_std": 0.22510364651679993, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2770128548145294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 486.1875, "completions/min_length": 421.0, "epoch": 12.866176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.6099658012390137, "kl": 0.011857803212478757, "learning_rate": 3.400095921354257e-07, "loss": 0.00011914968490600586, "reward": 0.7749999761581421, "reward_std": 0.4183265268802643, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 8749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 456.3125, "completions/min_length": 426.0, "epoch": 12.867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.2808047533035278, "kl": 0.008169321110472083, "learning_rate": 3.398880116081325e-07, "loss": 8.136779069900513e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 432.125, "completions/min_length": 384.0, "epoch": 12.869117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.009411283768713474, "kl": 0.008389716618694365, "learning_rate": 3.397664416286118e-07, "loss": 8.423833787674084e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 459.5, "completions/min_length": 422.0, "epoch": 12.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.5814650058746338, "kl": 0.010556690394878387, "learning_rate": 3.3964488220487246e-07, "loss": 0.00010494361049495637, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 8752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 491.25, "completions/min_length": 414.0, "epoch": 12.87205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591211080551147, "kl": 0.009435297455638647, "learning_rate": 3.3952333334492254e-07, "loss": 9.438395500183105e-05, "reward": 0.9354166984558105, "reward_std": 0.09005618840456009, "rewards/DrugCombAccuracyCOTORM/mean": 0.9270833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.16065549850463867, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 8753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 485.375, "completions/min_length": 412.0, "epoch": 12.873529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8041374683380127, "kl": 0.005977830558549613, "learning_rate": 3.3940179505676924e-07, "loss": 5.953013896942139e-05, "reward": 0.6443333625793457, "reward_std": 0.15101687610149384, "rewards/DrugCombAccuracyCOTORM/mean": 0.5762500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.49902406334877014, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 8754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 486.875, "completions/min_length": 410.0, "epoch": 12.875, "frac_reward_zero_std": 0.5, "grad_norm": 1.0588910579681396, "kl": 0.010945124318823218, "learning_rate": 3.392802673484193e-07, "loss": 0.00010970234870910645, "reward": 0.921625018119812, "reward_std": 0.14512230455875397, "rewards/DrugCombAccuracyCOTORM/mean": 0.9059374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.25702768564224243, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 8755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 484.9375, "completions/min_length": 403.0, "epoch": 12.876470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.126660943031311, "kl": 0.01114959386177361, "learning_rate": 3.3915875022787854e-07, "loss": 0.00011071788321714848, "reward": 0.8187500238418579, "reward_std": 0.15633133053779602, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.22669117152690887, "step": 8756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 429.75, "completions/min_length": 377.0, "epoch": 12.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.032746754586696625, "kl": 0.006670122034847736, "learning_rate": 3.3903724370315233e-07, "loss": 6.671658775303513e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 491.4375, "completions/min_length": 445.0, "epoch": 12.879411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.4940135478973389, "kl": 0.010249597136862576, "learning_rate": 3.389157477822452e-07, "loss": 0.0001021847128868103, "reward": 0.31672918796539307, "reward_std": 0.2116953432559967, "rewards/DrugCombAccuracyCOTORM/mean": 0.23250000178813934, "rewards/DrugCombAccuracyCOTORM/std": 0.3432038724422455, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.8310385346412659, "step": 8758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 435.3125, "completions/min_length": 380.0, "epoch": 12.880882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.206419587135315, "kl": 0.010424318257719278, "learning_rate": 3.3879426247316095e-07, "loss": 0.00010502724762773141, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 474.0, "completions/min_length": 436.0, "epoch": 12.882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.030815796926617622, "kl": 0.00954260618891567, "learning_rate": 3.386727877839027e-07, "loss": 9.595011215424165e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 535.0, "completions/min_length": 404.0, "epoch": 12.883823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.2622913122177124, "kl": 0.012463478604331613, "learning_rate": 3.3855132372247284e-07, "loss": 0.00012492857058532536, "reward": 0.6858541965484619, "reward_std": 0.14205524325370789, "rewards/DrugCombAccuracyCOTORM/mean": 0.6398698091506958, "rewards/DrugCombAccuracyCOTORM/std": 0.43513697385787964, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7395833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.31012991070747375, "step": 8761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/mean_length": 500.8125, "completions/min_length": 402.0, "epoch": 12.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9326134324073792, "kl": 0.009273514384403825, "learning_rate": 3.384298702968733e-07, "loss": 9.28882509469986e-05, "reward": 0.7258869409561157, "reward_std": 0.15681959688663483, "rewards/DrugCombAccuracyCOTORM/mean": 0.6794940233230591, "rewards/DrugCombAccuracyCOTORM/std": 0.4143121838569641, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2543601393699646, "step": 8762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 464.125, "completions/min_length": 389.0, "epoch": 12.886764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.021049976348877, "kl": 0.010389657225459814, "learning_rate": 3.383084275151049e-07, "loss": 0.00010246038436889648, "reward": 0.5758249759674072, "reward_std": 0.10556522756814957, "rewards/DrugCombAccuracyCOTORM/mean": 0.5322812795639038, "rewards/DrugCombAccuracyCOTORM/std": 0.4989054203033447, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 8763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 415.8125, "completions/min_length": 372.0, "epoch": 12.888235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009161095134913921, "kl": 0.00573403260204941, "learning_rate": 3.381869953851682e-07, "loss": 5.771899668616243e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 475.9375, "completions/min_length": 433.0, "epoch": 12.889705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7148614525794983, "kl": 0.006822052295319736, "learning_rate": 3.3806557391506287e-07, "loss": 6.810575723648071e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 423.0, "completions/min_length": 369.0, "epoch": 12.891176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.026271825656294823, "kl": 0.010677929036319256, "learning_rate": 3.3794416311278755e-07, "loss": 0.0001067234406946227, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 518.8125, "completions/min_length": 435.0, "epoch": 12.89264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.963431179523468, "kl": 0.007586869294755161, "learning_rate": 3.378227629863407e-07, "loss": 7.604435086250305e-05, "reward": 0.13125000894069672, "reward_std": 0.025877460837364197, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 387.3125, "completions/min_length": 353.0, "epoch": 12.894117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.13834232091903687, "kl": 0.011855891323648393, "learning_rate": 3.3770137354371973e-07, "loss": 0.00011805070244008675, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 453.0, "completions/min_length": 388.0, "epoch": 12.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9594774842262268, "kl": 0.010203417157754302, "learning_rate": 3.3757999479292164e-07, "loss": 0.00010240823030471802, "reward": 0.5375000238418579, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 483.5, "completions/min_length": 412.0, "epoch": 12.897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9914288520812988, "kl": 0.010210904642008245, "learning_rate": 3.374586267419424e-07, "loss": 0.00010323524475097656, "reward": 0.6600034832954407, "reward_std": 0.019220897927880287, "rewards/DrugCombAccuracyCOTORM/mean": 0.5997284054756165, "rewards/DrugCombAccuracyCOTORM/std": 0.41408368945121765, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.802207350730896, "rewards/DrugCombCoverageCOTORM/std": 0.2166447788476944, "step": 8770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 474.5, "completions/min_length": 419.0, "epoch": 12.898529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2056574821472168, "kl": 0.009682763367891312, "learning_rate": 3.3733726939877763e-07, "loss": 9.769201278686523e-05, "reward": 0.8812500238418579, "reward_std": 0.22598718106746674, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 476.25, "completions/min_length": 434.0, "epoch": 12.9, "frac_reward_zero_std": 0.5, "grad_norm": 1.0626416206359863, "kl": 0.009062158176675439, "learning_rate": 3.3721592277142173e-07, "loss": 9.140372276306152e-05, "reward": 0.960812509059906, "reward_std": 0.11083897948265076, "rewards/DrugCombAccuracyCOTORM/mean": 0.9529687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.18812499940395355, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 8772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 453.0625, "completions/min_length": 394.0, "epoch": 12.901470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010971663519740105, "kl": 0.007023995975032449, "learning_rate": 3.37094586867869e-07, "loss": 6.981925980653614e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 444.5625, "completions/min_length": 378.0, "epoch": 12.902941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0971217155456543, "kl": 0.0107715924968943, "learning_rate": 3.369732616961125e-07, "loss": 0.00010761618614196777, "reward": 0.6067166924476624, "reward_std": 0.1679822951555252, "rewards/DrugCombAccuracyCOTORM/mean": 0.5734999775886536, "rewards/DrugCombAccuracyCOTORM/std": 0.5012344717979431, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.807086706161499, "step": 8774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 434.4375, "completions/min_length": 381.0, "epoch": 12.904411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.00808797124773264, "kl": 0.00622172630392015, "learning_rate": 3.368519472641451e-07, "loss": 6.249708530958742e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 496.5, "completions/min_length": 425.0, "epoch": 12.905882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.4445163011550903, "kl": 0.008139007142744958, "learning_rate": 3.367306435799584e-07, "loss": 8.071586489677429e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 446.5, "completions/min_length": 406.0, "epoch": 12.907352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.022013267502188683, "kl": 0.007216635858640075, "learning_rate": 3.3660935065154385e-07, "loss": 7.267539331223816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 469.4375, "completions/min_length": 415.0, "epoch": 12.908823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010342925786972046, "kl": 0.006872838595882058, "learning_rate": 3.364880684868917e-07, "loss": 6.91208551870659e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 422.125, "completions/min_length": 356.0, "epoch": 12.910294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3045270442962646, "kl": 0.010321506764739752, "learning_rate": 3.3636679709399187e-07, "loss": 0.00010237842798233032, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 450.8125, "completions/min_length": 377.0, "epoch": 12.911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.022752605378627777, "kl": 0.010339280124753714, "learning_rate": 3.362455364808332e-07, "loss": 0.00010665226727724075, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 444.3125, "completions/min_length": 416.0, "epoch": 12.913235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.013023799285292625, "kl": 0.007141965441405773, "learning_rate": 3.3612428665540424e-07, "loss": 7.143827679101378e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 419.3125, "completions/min_length": 375.0, "epoch": 12.91470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0814601182937622, "kl": 0.00961711211130023, "learning_rate": 3.3600304762569263e-07, "loss": 9.54222196014598e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 440.5, "completions/min_length": 391.0, "epoch": 12.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016109537333250046, "kl": 0.009052456123754382, "learning_rate": 3.358818193996853e-07, "loss": 9.059435978997499e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 421.8125, "completions/min_length": 366.0, "epoch": 12.91764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006156082730740309, "kl": 0.005272028036415577, "learning_rate": 3.3576060198536814e-07, "loss": 5.2191742724971846e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 459.625, "completions/min_length": 401.0, "epoch": 12.919117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9541512727737427, "kl": 0.009447183343581855, "learning_rate": 3.3563939539072707e-07, "loss": 9.444968600291759e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 415.0, "completions/min_length": 377.0, "epoch": 12.920588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.044244322925806046, "kl": 0.008676523342728615, "learning_rate": 3.3551819962374655e-07, "loss": 8.670829993207008e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 489.5625, "completions/min_length": 418.0, "epoch": 12.922058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.04500395432114601, "kl": 0.010026449337601662, "learning_rate": 3.353970146924108e-07, "loss": 0.0001008635590551421, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 484.6875, "completions/min_length": 358.0, "epoch": 12.923529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8447043299674988, "kl": 0.011667103040963411, "learning_rate": 3.3527584060470325e-07, "loss": 0.00011386438563931733, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 507.1875, "completions/min_length": 428.0, "epoch": 12.925, "frac_reward_zero_std": 0.0, "grad_norm": 1.19203782081604, "kl": 0.009725093957968056, "learning_rate": 3.3515467736860646e-07, "loss": 9.686499834060669e-05, "reward": 0.3275694251060486, "reward_std": 0.24017255008220673, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7756944894790649, "rewards/DrugCombCoverageCOTORM/std": 0.5087581872940063, "step": 8789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 459.25, "completions/min_length": 372.0, "epoch": 12.926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9022764563560486, "kl": 0.009816600941121578, "learning_rate": 3.3503352499210235e-07, "loss": 9.707746357889846e-05, "reward": 0.6541666984558105, "reward_std": 0.21001699566841125, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5416666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5288001894950867, "step": 8790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 460.6875, "completions/min_length": 366.0, "epoch": 12.927941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.014631172642111778, "kl": 0.007203153450973332, "learning_rate": 3.349123834831721e-07, "loss": 7.178230589488521e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/mean_length": 492.125, "completions/min_length": 426.0, "epoch": 12.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9413453340530396, "kl": 0.008273431449197233, "learning_rate": 3.3479125284979616e-07, "loss": 8.319686457980424e-05, "reward": 0.8333333730697632, "reward_std": 0.1553286463022232, "rewards/DrugCombAccuracyCOTORM/mean": 0.7916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 509.625, "completions/min_length": 451.0, "epoch": 12.930882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3050849437713623, "kl": 0.010053649893961847, "learning_rate": 3.346701330999545e-07, "loss": 0.00010066479444503784, "reward": 0.6875, "reward_std": 0.4326514005661011, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.4732423722743988, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 449.25, "completions/min_length": 407.0, "epoch": 12.93235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0299568176269531, "kl": 0.008254491433035582, "learning_rate": 3.34549024241626e-07, "loss": 8.215720299631357e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 447.125, "completions/min_length": 431.0, "epoch": 12.933823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9299139976501465, "kl": 0.010297723696567118, "learning_rate": 3.344279262827892e-07, "loss": 0.00010309318895451725, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/mean_length": 499.4375, "completions/min_length": 327.0, "epoch": 12.935294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7157341837882996, "kl": 0.00840689626056701, "learning_rate": 3.343068392314216e-07, "loss": 8.357316255569458e-05, "reward": 0.592444121837616, "reward_std": 0.14932583272457123, "rewards/DrugCombAccuracyCOTORM/mean": 0.57091224193573, "rewards/DrugCombAccuracyCOTORM/std": 0.4806670844554901, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3571428656578064, "rewards/DrugCombCoverageCOTORM/std": 0.9461702704429626, "step": 8796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 414.25, "completions/min_length": 350.0, "epoch": 12.936764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8123767971992493, "kl": 0.006804100121371448, "learning_rate": 3.3418576309549993e-07, "loss": 6.749480962753296e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 499.0, "completions/min_length": 396.0, "epoch": 12.938235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.4347306489944458, "kl": 0.00959233834873885, "learning_rate": 3.3406469788300067e-07, "loss": 9.755590872373432e-05, "reward": 0.72764652967453, "reward_std": 0.11090733110904694, "rewards/DrugCombAccuracyCOTORM/mean": 0.6725572943687439, "rewards/DrugCombAccuracyCOTORM/std": 0.38384154438972473, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8960069417953491, "rewards/DrugCombCoverageCOTORM/std": 0.13231854140758514, "step": 8798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 438.5625, "completions/min_length": 366.0, "epoch": 12.939705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.910331130027771, "kl": 0.009818310150876641, "learning_rate": 3.339436436018992e-07, "loss": 9.876489639282227e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 475.3125, "completions/min_length": 361.0, "epoch": 12.941176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9671806693077087, "kl": 0.009902893099933863, "learning_rate": 3.338226002601702e-07, "loss": 9.98852847260423e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 529.875, "completions/min_length": 458.0, "epoch": 12.94264705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.5910859107971191, "kl": 0.014561640098690987, "learning_rate": 3.337015678657879e-07, "loss": 0.00014724209904670715, "reward": 0.4843713939189911, "reward_std": 0.34446829557418823, "rewards/DrugCombAccuracyCOTORM/mean": 0.4256899356842041, "rewards/DrugCombAccuracyCOTORM/std": 0.3880857825279236, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4381944537162781, "rewards/DrugCombCoverageCOTORM/std": 0.47799256443977356, "step": 8801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 478.5625, "completions/min_length": 390.0, "epoch": 12.944117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.320488691329956, "kl": 0.010227216640487313, "learning_rate": 3.335805464267253e-07, "loss": 0.00010181963443756104, "reward": 0.45625001192092896, "reward_std": 0.39101099967956543, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.7274384498596191, "step": 8802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 481.25, "completions/min_length": 421.0, "epoch": 12.945588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.441580891609192, "kl": 0.010698299389332533, "learning_rate": 3.3345953595095524e-07, "loss": 0.00010697543621063232, "reward": 0.6020833253860474, "reward_std": 0.20147259533405304, "rewards/DrugCombAccuracyCOTORM/mean": 0.5416666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 452.125, "completions/min_length": 411.0, "epoch": 12.947058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.00829615630209446, "kl": 0.006996916956268251, "learning_rate": 3.3333853644644944e-07, "loss": 6.982001650612801e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 387.8125, "completions/min_length": 312.0, "epoch": 12.948529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0131077840924263, "kl": 0.0071799446595832705, "learning_rate": 3.3321754792117905e-07, "loss": 7.312330126296729e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 453.6875, "completions/min_length": 372.0, "epoch": 12.95, "frac_reward_zero_std": 0.5, "grad_norm": 0.8323475122451782, "kl": 0.008618360268883407, "learning_rate": 3.330965703831146e-07, "loss": 8.592948142904788e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 432.5625, "completions/min_length": 395.0, "epoch": 12.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011058876290917397, "kl": 0.011907952721230686, "learning_rate": 3.3297560384022576e-07, "loss": 0.00011995930253760889, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 448.125, "completions/min_length": 392.0, "epoch": 12.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01565605401992798, "kl": 0.00828931957948953, "learning_rate": 3.328546483004814e-07, "loss": 8.279269968625158e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 441.375, "completions/min_length": 408.0, "epoch": 12.954411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.009461396373808384, "kl": 0.0079855922376737, "learning_rate": 3.327337037718498e-07, "loss": 8.0237710790243e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 459.4375, "completions/min_length": 384.0, "epoch": 12.955882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.013697631657123566, "kl": 0.008177563780918717, "learning_rate": 3.326127702622985e-07, "loss": 8.189972140826285e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 445.3125, "completions/min_length": 388.0, "epoch": 12.95735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9344748854637146, "kl": 0.0123442851472646, "learning_rate": 3.324918477797942e-07, "loss": 0.0001221001148223877, "reward": 0.8051249980926514, "reward_std": 0.20876194536685944, "rewards/DrugCombAccuracyCOTORM/mean": 0.7603124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.43035051226615906, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 457.5625, "completions/min_length": 405.0, "epoch": 12.958823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01495328638702631, "kl": 0.011297836899757385, "learning_rate": 3.3237093633230316e-07, "loss": 0.00011306599481031299, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 419.4375, "completions/min_length": 376.0, "epoch": 12.96029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011118900962173939, "kl": 0.007546510547399521, "learning_rate": 3.322500359277907e-07, "loss": 7.500646461267024e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 472.5, "completions/min_length": 389.0, "epoch": 12.961764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9277189373970032, "kl": 0.010496466420590878, "learning_rate": 3.3212914657422123e-07, "loss": 0.00010503828525543213, "reward": 0.59375, "reward_std": 0.0176776684820652, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 473.125, "completions/min_length": 422.0, "epoch": 12.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9170105457305908, "kl": 0.018288310151547194, "learning_rate": 3.320082682795588e-07, "loss": 0.00018975701823364943, "reward": 0.48383331298828125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.42250001430511475, "rewards/DrugCombAccuracyCOTORM/std": 0.410779744386673, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7391185760498047, "step": 8815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 472.5625, "completions/min_length": 401.0, "epoch": 12.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8256635665893555, "kl": 0.011908553075045347, "learning_rate": 3.318874010517665e-07, "loss": 0.00011835886834887788, "reward": 0.824999988079071, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 466.3125, "completions/min_length": 413.0, "epoch": 12.966176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.035651758313179016, "kl": 0.009864969761110842, "learning_rate": 3.3176654489880675e-07, "loss": 9.824696462601423e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 458.1875, "completions/min_length": 402.0, "epoch": 12.967647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8241637349128723, "kl": 0.006756916642189026, "learning_rate": 3.316456998286413e-07, "loss": 6.737187504768372e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 476.8125, "completions/min_length": 434.0, "epoch": 12.969117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014558346010744572, "kl": 0.008079979219473898, "learning_rate": 3.31524865849231e-07, "loss": 8.059320680331439e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/mean_length": 578.8125, "completions/min_length": 447.0, "epoch": 12.970588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 1.5842952728271484, "kl": 0.009992381557822227, "learning_rate": 3.314040429685363e-07, "loss": 0.00010012835264205933, "reward": 0.6586748361587524, "reward_std": 0.3497520685195923, "rewards/DrugCombAccuracyCOTORM/mean": 0.5785518884658813, "rewards/DrugCombAccuracyCOTORM/std": 0.4381396472454071, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.07453560084104538, "step": 8820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 425.25, "completions/min_length": 351.0, "epoch": 12.972058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.034738555550575256, "kl": 0.007120884722098708, "learning_rate": 3.312832311945165e-07, "loss": 7.091292354743928e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 416.625, "completions/min_length": 360.0, "epoch": 12.973529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.014569169841706753, "kl": 0.007632713532075286, "learning_rate": 3.311624305351307e-07, "loss": 7.637061935383826e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 402.625, "completions/min_length": 365.0, "epoch": 12.975, "frac_reward_zero_std": 1.0, "grad_norm": 0.025567874312400818, "kl": 0.009441079804673791, "learning_rate": 3.310416409983365e-07, "loss": 9.382007556268945e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 461.125, "completions/min_length": 410.0, "epoch": 12.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8089278340339661, "kl": 0.007397870416752994, "learning_rate": 3.309208625920914e-07, "loss": 7.409743557218462e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 461.6875, "completions/min_length": 393.0, "epoch": 12.977941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1845591068267822, "kl": 0.009842341998592019, "learning_rate": 3.3080009532435205e-07, "loss": 9.873404633253813e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 462.3125, "completions/min_length": 392.0, "epoch": 12.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.3461982011795044, "kl": 0.019066740875132382, "learning_rate": 3.3067933920307417e-07, "loss": 0.00018855846428778023, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 441.25, "completions/min_length": 386.0, "epoch": 12.980882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01084235217422247, "kl": 0.008835296612232924, "learning_rate": 3.30558594236213e-07, "loss": 8.865229756338522e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 418.4375, "completions/min_length": 384.0, "epoch": 12.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04034779593348503, "kl": 0.011273809825070202, "learning_rate": 3.3043786043172285e-07, "loss": 0.00011157659173477441, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 456.3125, "completions/min_length": 422.0, "epoch": 12.983823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.018141401931643486, "kl": 0.007394844898954034, "learning_rate": 3.3031713779755733e-07, "loss": 7.376297435257584e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 453.125, "completions/min_length": 415.0, "epoch": 12.985294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.911531388759613, "kl": 0.012478314922191203, "learning_rate": 3.3019642634166925e-07, "loss": 0.0001241937279701233, "reward": 0.8052083253860474, "reward_std": 0.014731401577591896, "rewards/DrugCombAccuracyCOTORM/mean": 0.7604166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25069350004196167, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 465.25, "completions/min_length": 426.0, "epoch": 12.986764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.207768201828003, "kl": 0.009040867211297154, "learning_rate": 3.3007572607201094e-07, "loss": 8.990796777652577e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 435.0, "completions/min_length": 377.0, "epoch": 12.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8685465455055237, "kl": 0.0102471032878384, "learning_rate": 3.299550369965338e-07, "loss": 0.00010246102465316653, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 506.8125, "completions/min_length": 437.0, "epoch": 12.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.3104702234268188, "kl": 0.010818512178957462, "learning_rate": 3.2983435912318847e-07, "loss": 0.00011001527309417725, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 420.6875, "completions/min_length": 374.0, "epoch": 12.991176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9578303694725037, "kl": 0.009341709199361503, "learning_rate": 3.29713692459925e-07, "loss": 9.189173579216003e-05, "reward": 0.637499988079071, "reward_std": 0.1505940705537796, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 461.3125, "completions/min_length": 359.0, "epoch": 12.992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.9939382076263428, "kl": 0.008589293458499014, "learning_rate": 3.295930370146925e-07, "loss": 8.60951840877533e-05, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 510.9375, "completions/min_length": 460.0, "epoch": 12.994117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9255837798118591, "kl": 0.008023058529943228, "learning_rate": 3.294723927954394e-07, "loss": 8.011609315872192e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 456.625, "completions/min_length": 404.0, "epoch": 12.995588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9528233408927917, "kl": 0.00852804840542376, "learning_rate": 3.2935175981011353e-07, "loss": 8.547306060791016e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 471.6875, "completions/min_length": 416.0, "epoch": 12.99705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0368919372558594, "kl": 0.011887046042829752, "learning_rate": 3.292311380666619e-07, "loss": 0.00011876970529556274, "reward": 0.53125, "reward_std": 0.025877460837364197, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3125, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 8838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 463.75, "completions/min_length": 413.0, "epoch": 12.998529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.3513405323028564, "kl": 0.013361604884266853, "learning_rate": 3.291105275730307e-07, "loss": 0.00013078004121780396, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 424.25, "completions/min_length": 303.0, "epoch": 13.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.011493546888232231, "kl": 0.008355066296644509, "learning_rate": 3.2898992833716563e-07, "loss": 8.207232167478651e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 469.0, "completions/min_length": 401.0, "epoch": 13.001470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 1.537018060684204, "kl": 0.01435210881754756, "learning_rate": 3.288693403670112e-07, "loss": 0.00014401227235794067, "reward": 0.6625000238418579, "reward_std": 0.3919961452484131, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/mean_length": 543.875, "completions/min_length": 417.0, "epoch": 13.00294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.7725290060043335, "kl": 0.009545981884002686, "learning_rate": 3.2874876367051153e-07, "loss": 9.590014815330505e-05, "reward": 0.7605739831924438, "reward_std": 0.02780614234507084, "rewards/DrugCombAccuracyCOTORM/mean": 0.7146063446998596, "rewards/DrugCombAccuracyCOTORM/std": 0.3001008927822113, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8888888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.1814436912536621, "step": 8842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 445.75, "completions/min_length": 353.0, "epoch": 13.004411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017492322251200676, "kl": 0.009629819542169571, "learning_rate": 3.2862819825560985e-07, "loss": 9.571712871547788e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 415.875, "completions/min_length": 371.0, "epoch": 13.005882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012074712663888931, "kl": 0.009854661300778389, "learning_rate": 3.285076441302489e-07, "loss": 9.842831059359014e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 482.875, "completions/min_length": 393.0, "epoch": 13.007352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.6140851974487305, "kl": 0.009831373929046094, "learning_rate": 3.2838710130237023e-07, "loss": 9.864568710327148e-05, "reward": 0.6625000238418579, "reward_std": 0.37970849871635437, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.48196646571159363, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 442.1875, "completions/min_length": 331.0, "epoch": 13.008823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01275841984897852, "kl": 0.008491401094943285, "learning_rate": 3.2826656977991523e-07, "loss": 8.573754166718572e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 458.8125, "completions/min_length": 405.0, "epoch": 13.010294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.013566729612648487, "kl": 0.008252152591012418, "learning_rate": 3.281460495708239e-07, "loss": 8.224058547057211e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 439.875, "completions/min_length": 374.0, "epoch": 13.011764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0459011793136597, "kl": 0.01051240786910057, "learning_rate": 3.280255406830359e-07, "loss": 0.00010508298873901367, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 409.8125, "completions/min_length": 351.0, "epoch": 13.013235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8635507822036743, "kl": 0.008204096695408225, "learning_rate": 3.2790504312449007e-07, "loss": 8.178502321243286e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 459.0, "completions/min_length": 414.0, "epoch": 13.014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.024663319811224937, "kl": 0.009044640231877565, "learning_rate": 3.2778455690312447e-07, "loss": 9.09226801013574e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 421.5, "completions/min_length": 379.0, "epoch": 13.016176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9212979078292847, "kl": 0.008069732110016048, "learning_rate": 3.2766408202687646e-07, "loss": 8.054523641476408e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/mean_length": 468.3125, "completions/min_length": 349.0, "epoch": 13.01764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0188878774642944, "kl": 0.012039897148497403, "learning_rate": 3.2754361850368266e-07, "loss": 0.00012178346514701843, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 426.4375, "completions/min_length": 359.0, "epoch": 13.019117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.0667884349822998, "kl": 0.016414465848356485, "learning_rate": 3.2742316634147874e-07, "loss": 0.00016357749700546265, "reward": 0.3427083492279053, "reward_std": 0.10549373179674149, "rewards/DrugCombAccuracyCOTORM/mean": 0.2916666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.32489314675331116, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.09375, "rewards/DrugCombCoverageCOTORM/std": 1.0036392211914062, "step": 8853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 483.0625, "completions/min_length": 401.0, "epoch": 13.020588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.105704426765442, "kl": 0.010551158338785172, "learning_rate": 3.2730272554819995e-07, "loss": 0.00010482221841812134, "reward": 0.8608125448226929, "reward_std": 0.19394290447235107, "rewards/DrugCombAccuracyCOTORM/mean": 0.8279687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.3735184371471405, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 8854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 435.125, "completions/min_length": 326.0, "epoch": 13.022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.055684398859739304, "kl": 0.008892430225387216, "learning_rate": 3.271822961317805e-07, "loss": 8.830578008200973e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 483.75, "completions/min_length": 414.0, "epoch": 13.023529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.04630735516548157, "kl": 0.010721478494815528, "learning_rate": 3.270618781001541e-07, "loss": 0.00010698389814933762, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 394.125, "completions/min_length": 348.0, "epoch": 13.025, "frac_reward_zero_std": 0.5, "grad_norm": 1.1557092666625977, "kl": 0.009373839478939772, "learning_rate": 3.269414714612534e-07, "loss": 9.373594366479665e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 450.875, "completions/min_length": 397.0, "epoch": 13.026470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 1.4721201658248901, "kl": 0.02913833176717162, "learning_rate": 3.268210762230108e-07, "loss": 0.0002819788351189345, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 459.8125, "completions/min_length": 363.0, "epoch": 13.027941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009316947311162949, "kl": 0.006089219357818365, "learning_rate": 3.267006923933573e-07, "loss": 6.094381751609035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 453.1875, "completions/min_length": 379.0, "epoch": 13.029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1413458585739136, "kl": 0.008063475717790425, "learning_rate": 3.2658031998022363e-07, "loss": 8.076718950178474e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 455.8125, "completions/min_length": 389.0, "epoch": 13.030882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9162874221801758, "kl": 0.009891223278827965, "learning_rate": 3.264599589915396e-07, "loss": 9.90554690361023e-05, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 443.0, "completions/min_length": 364.0, "epoch": 13.032352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.045325517654419, "kl": 0.010196855990216136, "learning_rate": 3.2633960943523433e-07, "loss": 0.00010280311107635498, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 476.9375, "completions/min_length": 434.0, "epoch": 13.033823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.907894492149353, "kl": 0.02426233608275652, "learning_rate": 3.262192713192361e-07, "loss": 0.00023771077394485474, "reward": 0.7437499761581421, "reward_std": 0.21286733448505402, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 469.0, "completions/min_length": 394.0, "epoch": 13.035294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01889578066766262, "kl": 0.010319109307602048, "learning_rate": 3.260989446514726e-07, "loss": 0.00010372072574682534, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 479.8125, "completions/min_length": 353.0, "epoch": 13.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.184473991394043, "kl": 0.009926836472004652, "learning_rate": 3.2597862943987036e-07, "loss": 9.851949289441109e-05, "reward": 0.7382166385650635, "reward_std": 0.1621885895729065, "rewards/DrugCombAccuracyCOTORM/mean": 0.6909999847412109, "rewards/DrugCombAccuracyCOTORM/std": 0.4125804603099823, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.20069323480129242, "step": 8865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 483.0, "completions/min_length": 397.0, "epoch": 13.038235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705902934074402, "kl": 0.011426719604060054, "learning_rate": 3.258583256923557e-07, "loss": 0.00011326055391691625, "reward": 0.7833333611488342, "reward_std": 0.2007920891046524, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4425306022167206, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 450.3125, "completions/min_length": 375.0, "epoch": 13.03970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0418548583984375, "kl": 0.009617348201572895, "learning_rate": 3.257380334168538e-07, "loss": 9.710511949378997e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 489.0, "completions/min_length": 390.0, "epoch": 13.041176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9319186210632324, "kl": 0.0077266322914510965, "learning_rate": 3.256177526212893e-07, "loss": 7.825903594493866e-05, "reward": 0.6541666984558105, "reward_std": 0.13562028110027313, "rewards/DrugCombAccuracyCOTORM/mean": 0.5833333134651184, "rewards/DrugCombAccuracyCOTORM/std": 0.4791968762874603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 485.625, "completions/min_length": 422.0, "epoch": 13.04264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018067732453346252, "kl": 0.011253283359110355, "learning_rate": 3.254974833135859e-07, "loss": 0.000111443929199595, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 508.375, "completions/min_length": 415.0, "epoch": 13.044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.012484815903007984, "kl": 0.007461270899511874, "learning_rate": 3.253772255016668e-07, "loss": 7.458919571945444e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 478.6875, "completions/min_length": 431.0, "epoch": 13.045588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015368424355983734, "kl": 0.007910043117590249, "learning_rate": 3.2525697919345406e-07, "loss": 7.917296170489863e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 395.375, "completions/min_length": 354.0, "epoch": 13.047058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.021980812773108482, "kl": 0.010275133536197245, "learning_rate": 3.251367443968694e-07, "loss": 0.00010243198630632833, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 414.0, "completions/min_length": 384.0, "epoch": 13.048529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03194295987486839, "kl": 0.0068256446393206716, "learning_rate": 3.250165211198334e-07, "loss": 6.824618321843445e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 497.25, "completions/min_length": 442.0, "epoch": 13.05, "frac_reward_zero_std": 0.5, "grad_norm": 0.9428789615631104, "kl": 0.013239916297607124, "learning_rate": 3.248963093702662e-07, "loss": 0.00013270009367261082, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.25, "rewards/DrugCombCoverageCOTORM/std": 1.0, "step": 8874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 440.0, "completions/min_length": 375.0, "epoch": 13.051470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.013607150875031948, "kl": 0.007597593008540571, "learning_rate": 3.24776109156087e-07, "loss": 7.59185422793962e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 395.75, "completions/min_length": 360.0, "epoch": 13.052941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.036800552159547806, "kl": 0.009499546024017036, "learning_rate": 3.2465592048521437e-07, "loss": 9.488064097240567e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 421.9375, "completions/min_length": 370.0, "epoch": 13.054411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.00963230524212122, "kl": 0.006832234677858651, "learning_rate": 3.245357433655659e-07, "loss": 6.837243563495576e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 499.8125, "completions/min_length": 434.0, "epoch": 13.055882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.066089391708374, "kl": 0.010877386783249676, "learning_rate": 3.244155778050586e-07, "loss": 0.0001093503087759018, "reward": 0.8553333282470703, "reward_std": 0.20654159784317017, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3471022844314575, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5018484592437744, "step": 8878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 388.875, "completions/min_length": 317.0, "epoch": 13.05735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011236936785280704, "kl": 0.007549420814029872, "learning_rate": 3.242954238116087e-07, "loss": 7.532966992584988e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 448.5, "completions/min_length": 392.0, "epoch": 13.058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.276526927947998, "kl": 0.012467701686546206, "learning_rate": 3.2417528139313157e-07, "loss": 0.00012384355068206787, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 425.5625, "completions/min_length": 372.0, "epoch": 13.060294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012431987561285496, "kl": 0.006865429459139705, "learning_rate": 3.2405515055754195e-07, "loss": 6.829774065408856e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 373.9375, "completions/min_length": 303.0, "epoch": 13.061764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01016309205442667, "kl": 0.006238120957277715, "learning_rate": 3.239350313127539e-07, "loss": 6.124802166596055e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 475.5625, "completions/min_length": 399.0, "epoch": 13.063235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0251610279083252, "kl": 0.015487949829548597, "learning_rate": 3.2381492366668026e-07, "loss": 0.00015397369861602783, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 465.0, "completions/min_length": 420.0, "epoch": 13.064705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8511250019073486, "kl": 0.0127630231436342, "learning_rate": 3.2369482762723366e-07, "loss": 0.00012664473615586758, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 433.8125, "completions/min_length": 357.0, "epoch": 13.066176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9488424062728882, "kl": 0.009145268006250262, "learning_rate": 3.235747432023256e-07, "loss": 9.177625179290771e-05, "reward": 0.7875000238418579, "reward_std": 0.2295181304216385, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 431.9375, "completions/min_length": 397.0, "epoch": 13.06764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.018673062324524, "kl": 0.011548879090696573, "learning_rate": 3.2345467039986694e-07, "loss": 0.00011477550287963822, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 457.75, "completions/min_length": 410.0, "epoch": 13.069117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8771410584449768, "kl": 0.007155878120101988, "learning_rate": 3.233346092277679e-07, "loss": 7.127882417989895e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 492.0, "completions/min_length": 395.0, "epoch": 13.070588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.4904892444610596, "kl": 0.013552381074987352, "learning_rate": 3.232145596939378e-07, "loss": 0.00013640522956848145, "reward": 0.5672500133514404, "reward_std": 0.16428446769714355, "rewards/DrugCombAccuracyCOTORM/mean": 0.5085416436195374, "rewards/DrugCombAccuracyCOTORM/std": 0.4490615129470825, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6041666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.47482940554618835, "step": 8888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 461.25, "completions/min_length": 417.0, "epoch": 13.072058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1334120035171509, "kl": 0.01158674550242722, "learning_rate": 3.23094521806285e-07, "loss": 0.00011437928333180025, "reward": 0.675000011920929, "reward_std": 0.10350983589887619, "rewards/DrugCombAccuracyCOTORM/mean": 0.59375, "rewards/DrugCombAccuracyCOTORM/std": 0.4552929699420929, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 463.6875, "completions/min_length": 392.0, "epoch": 13.073529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 1.266197919845581, "kl": 0.012447427259758115, "learning_rate": 3.229744955727174e-07, "loss": 0.00012461841106414795, "reward": 0.75, "reward_std": 0.39218366146087646, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 491.875, "completions/min_length": 381.0, "epoch": 13.075, "frac_reward_zero_std": 0.0, "grad_norm": 1.366455316543579, "kl": 0.011067113606259227, "learning_rate": 3.22854481001142e-07, "loss": 0.00011058524250984192, "reward": 0.762499988079071, "reward_std": 0.4001959264278412, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 8891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 495.125, "completions/min_length": 397.0, "epoch": 13.076470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3075710535049438, "kl": 0.012969480128958821, "learning_rate": 3.227344780994652e-07, "loss": 0.0001293271780014038, "reward": 0.5898703336715698, "reward_std": 0.32050150632858276, "rewards/DrugCombAccuracyCOTORM/mean": 0.5333448648452759, "rewards/DrugCombAccuracyCOTORM/std": 0.42687496542930603, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6319444179534912, "rewards/DrugCombCoverageCOTORM/std": 0.6881940960884094, "step": 8892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 455.125, "completions/min_length": 408.0, "epoch": 13.077941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.0224195718765259, "kl": 0.011151910177432, "learning_rate": 3.2261448687559227e-07, "loss": 0.00011084973812103271, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 411.6875, "completions/min_length": 363.0, "epoch": 13.079411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9255572557449341, "kl": 0.0072187898913398385, "learning_rate": 3.2249450733742824e-07, "loss": 7.203221321105957e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 416.0, "completions/min_length": 354.0, "epoch": 13.080882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1745364665985107, "kl": 0.010340440785512328, "learning_rate": 3.2237453949287666e-07, "loss": 0.00010497123003005981, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 477.125, "completions/min_length": 366.0, "epoch": 13.08235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0186370611190796, "kl": 0.012709362199530005, "learning_rate": 3.22254583349841e-07, "loss": 0.00012708794383797795, "reward": 0.6354166865348816, "reward_std": 0.14892138540744781, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 8896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 476.1875, "completions/min_length": 393.0, "epoch": 13.083823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.684333086013794, "kl": 0.011591815389692783, "learning_rate": 3.221346389162235e-07, "loss": 0.0001149191812146455, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 445.5, "completions/min_length": 381.0, "epoch": 13.08529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.792853593826294, "kl": 0.0064781957771629095, "learning_rate": 3.2201470619992586e-07, "loss": 6.470594962593168e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 456.75, "completions/min_length": 423.0, "epoch": 13.086764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9568247199058533, "kl": 0.007392119034193456, "learning_rate": 3.218947852088489e-07, "loss": 7.364898920059204e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 467.5625, "completions/min_length": 393.0, "epoch": 13.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012771828100085258, "kl": 0.006760142627172172, "learning_rate": 3.21774875950893e-07, "loss": 6.773469795007259e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 445.3125, "completions/min_length": 351.0, "epoch": 13.089705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8139688968658447, "kl": 0.007935873582027853, "learning_rate": 3.2165497843395703e-07, "loss": 7.944554090499878e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 459.625, "completions/min_length": 412.0, "epoch": 13.091176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7939208149909973, "kl": 0.011406886857002974, "learning_rate": 3.2153509266593983e-07, "loss": 0.00011298060417175293, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 469.375, "completions/min_length": 386.0, "epoch": 13.092647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.011865278705954552, "kl": 0.008223623735830188, "learning_rate": 3.214152186547391e-07, "loss": 8.279864414362237e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 455.25, "completions/min_length": 408.0, "epoch": 13.094117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3429622650146484, "kl": 0.010076409205794334, "learning_rate": 3.212953564082517e-07, "loss": 0.00010035932064056396, "reward": 0.668749988079071, "reward_std": 0.3905540704727173, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.704154372215271, "step": 8904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 445.6875, "completions/min_length": 404.0, "epoch": 13.095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1633650064468384, "kl": 0.009522645850665867, "learning_rate": 3.211755059343741e-07, "loss": 9.530782699584961e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/mean_length": 479.75, "completions/min_length": 359.0, "epoch": 13.097058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8642625212669373, "kl": 0.01040668087080121, "learning_rate": 3.210556672410016e-07, "loss": 0.00010287867917213589, "reward": 0.7830256223678589, "reward_std": 0.2046276330947876, "rewards/DrugCombAccuracyCOTORM/mean": 0.751242995262146, "rewards/DrugCombAccuracyCOTORM/std": 0.4064215123653412, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8203125, "rewards/DrugCombCoverageCOTORM/std": 0.3534613251686096, "step": 8906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 446.8125, "completions/min_length": 403.0, "epoch": 13.098529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9337677359580994, "kl": 0.009389485931023955, "learning_rate": 3.2093584033602894e-07, "loss": 9.396961831953377e-05, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 480.1875, "completions/min_length": 401.0, "epoch": 13.1, "frac_reward_zero_std": 0.5, "grad_norm": 1.1834285259246826, "kl": 0.012002728530205786, "learning_rate": 3.2081602522734985e-07, "loss": 0.00012019056885037571, "reward": 0.4921875, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 8908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/mean_length": 488.1875, "completions/min_length": 423.0, "epoch": 13.101470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.452171802520752, "kl": 0.013476526597514749, "learning_rate": 3.206962219228576e-07, "loss": 0.00013364851474761963, "reward": 0.5407360792160034, "reward_std": 0.2476700246334076, "rewards/DrugCombAccuracyCOTORM/mean": 0.47992557287216187, "rewards/DrugCombAccuracyCOTORM/std": 0.4221448302268982, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5679563283920288, "rewards/DrugCombCoverageCOTORM/std": 0.6392418742179871, "step": 8909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 424.4375, "completions/min_length": 382.0, "epoch": 13.102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.019934058189392, "kl": 0.008082592394202948, "learning_rate": 3.2057643043044446e-07, "loss": 8.026704745134339e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 437.3125, "completions/min_length": 316.0, "epoch": 13.104411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075988853350281715, "kl": 0.006363705615513027, "learning_rate": 3.20456650758002e-07, "loss": 6.366024899762124e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 473.6875, "completions/min_length": 406.0, "epoch": 13.105882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7755857110023499, "kl": 0.00848577229771763, "learning_rate": 3.203368829134212e-07, "loss": 8.483976125717163e-05, "reward": 0.6458333730697632, "reward_std": 0.14330288767814636, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 8912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 479.9375, "completions/min_length": 410.0, "epoch": 13.10735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.000305414199829, "kl": 0.00938502186909318, "learning_rate": 3.202171269045918e-07, "loss": 9.441797010367736e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 445.4375, "completions/min_length": 385.0, "epoch": 13.108823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019993092864751816, "kl": 0.009109644684940577, "learning_rate": 3.20097382739403e-07, "loss": 9.133135608863086e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 472.6875, "completions/min_length": 425.0, "epoch": 13.110294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.1089437007904053, "kl": 0.014660530490800738, "learning_rate": 3.199776504257434e-07, "loss": 0.00014712443226017058, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 449.375, "completions/min_length": 371.0, "epoch": 13.111764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9989216327667236, "kl": 0.0089096047449857, "learning_rate": 3.1985792997150064e-07, "loss": 8.948147296905518e-05, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 8916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 438.8125, "completions/min_length": 384.0, "epoch": 13.113235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015570082701742649, "kl": 0.010502374381758273, "learning_rate": 3.197382213845615e-07, "loss": 0.00010526021651457995, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/mean_length": 492.25, "completions/min_length": 447.0, "epoch": 13.114705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.8577074408531189, "kl": 0.008225114899687469, "learning_rate": 3.1961852467281224e-07, "loss": 8.226931095123291e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 464.625, "completions/min_length": 393.0, "epoch": 13.116176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.317169189453125, "kl": 0.013579206308349967, "learning_rate": 3.1949883984413805e-07, "loss": 0.00013650208711624146, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 442.0, "completions/min_length": 388.0, "epoch": 13.117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8215652108192444, "kl": 0.007544974330812693, "learning_rate": 3.1937916690642355e-07, "loss": 7.587366417283192e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 471.4375, "completions/min_length": 394.0, "epoch": 13.119117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.883821964263916, "kl": 0.008605485549196601, "learning_rate": 3.1925950586755234e-07, "loss": 8.644103945698589e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 476.875, "completions/min_length": 401.0, "epoch": 13.120588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.12906533479690552, "kl": 0.011299797683022916, "learning_rate": 3.1913985673540755e-07, "loss": 0.00011244295455981046, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 8922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 449.6875, "completions/min_length": 410.0, "epoch": 13.12205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012536329217255116, "kl": 0.007232952048070729, "learning_rate": 3.1902021951787124e-07, "loss": 7.213681237772107e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 427.1875, "completions/min_length": 387.0, "epoch": 13.123529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.1238352060317993, "kl": 0.011278141755610704, "learning_rate": 3.18900594222825e-07, "loss": 0.00011263155465712771, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 455.75, "completions/min_length": 362.0, "epoch": 13.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.949391782283783, "kl": 0.009109015809372067, "learning_rate": 3.187809808581492e-07, "loss": 9.168684482574463e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 457.5, "completions/min_length": 395.0, "epoch": 13.126470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9900369048118591, "kl": 0.009408051962964237, "learning_rate": 3.186613794317238e-07, "loss": 9.467452764511108e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 424.9375, "completions/min_length": 380.0, "epoch": 13.12794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008569451980292797, "kl": 0.007596935029141605, "learning_rate": 3.1854178995142777e-07, "loss": 7.620616088388488e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 539.5625, "completions/min_length": 457.0, "epoch": 13.129411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.7882051467895508, "kl": 0.009335443028248847, "learning_rate": 3.184222124251394e-07, "loss": 9.396829409524798e-05, "reward": 0.7484375238418579, "reward_std": 0.20835641026496887, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 8928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 490.75, "completions/min_length": 427.0, "epoch": 13.130882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.8533625602722168, "kl": 0.01140804449096322, "learning_rate": 3.1830264686073614e-07, "loss": 0.00011349469423294067, "reward": 0.778166651725769, "reward_std": 0.3239409923553467, "rewards/DrugCombAccuracyCOTORM/mean": 0.7487499713897705, "rewards/DrugCombAccuracyCOTORM/std": 0.3821321725845337, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.30731815099716187, "step": 8929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 459.3125, "completions/min_length": 392.0, "epoch": 13.132352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01424061506986618, "kl": 0.011261282488703728, "learning_rate": 3.181830932660948e-07, "loss": 0.00011241326137678698, "reward": 0.8516666889190674, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 8930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 425.3125, "completions/min_length": 350.0, "epoch": 13.133823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8367215394973755, "kl": 0.008734726463444531, "learning_rate": 3.1806355164909114e-07, "loss": 8.746236562728882e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 8931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 496.0, "completions/min_length": 449.0, "epoch": 13.135294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.011622997932136059, "kl": 0.007962475414387882, "learning_rate": 3.1794402201760016e-07, "loss": 7.960241782711819e-05, "reward": 0.872372567653656, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8592156767845154, "rewards/DrugCombAccuracyCOTORM/std": 0.14540143311023712, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8500000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.1549193412065506, "step": 8932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 438.4375, "completions/min_length": 370.0, "epoch": 13.136764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020594099536538124, "kl": 0.007837063167244196, "learning_rate": 3.178245043794964e-07, "loss": 7.805642962921411e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 477.5625, "completions/min_length": 371.0, "epoch": 13.138235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9898967742919922, "kl": 0.0100415563210845, "learning_rate": 3.177049987426532e-07, "loss": 9.906291961669922e-05, "reward": 0.887499988079071, "reward_std": 0.21001701056957245, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 459.0625, "completions/min_length": 400.0, "epoch": 13.139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.018593531101942062, "kl": 0.009932710439898074, "learning_rate": 3.175855051149433e-07, "loss": 9.983571362681687e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 433.375, "completions/min_length": 351.0, "epoch": 13.141176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.2803571224212646, "kl": 0.00999009981751442, "learning_rate": 3.174660235042389e-07, "loss": 9.932368993759155e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 481.625, "completions/min_length": 395.0, "epoch": 13.14264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013077478855848312, "kl": 0.007704412681050599, "learning_rate": 3.1734655391841073e-07, "loss": 7.762822497170419e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 437.6875, "completions/min_length": 347.0, "epoch": 13.144117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.017150618135929108, "kl": 0.009389055659994483, "learning_rate": 3.1722709636532943e-07, "loss": 9.40360187087208e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 466.4375, "completions/min_length": 362.0, "epoch": 13.145588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1239652633666992, "kl": 0.009387200931087136, "learning_rate": 3.171076508528644e-07, "loss": 9.369112376589328e-05, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 8939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 473.625, "completions/min_length": 421.0, "epoch": 13.147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.017641127109527588, "kl": 0.008747148676775396, "learning_rate": 3.169882173888846e-07, "loss": 8.759826596360654e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 477.125, "completions/min_length": 423.0, "epoch": 13.148529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0642387866973877, "kl": 0.020857537165284157, "learning_rate": 3.1686879598125784e-07, "loss": 0.00021051615476608276, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 432.6875, "completions/min_length": 391.0, "epoch": 13.15, "frac_reward_zero_std": 0.5, "grad_norm": 0.8472117781639099, "kl": 0.008119381149299443, "learning_rate": 3.167493866378514e-07, "loss": 8.211284875869751e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 430.625, "completions/min_length": 273.0, "epoch": 13.151470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010749940760433674, "kl": 0.007822038372978568, "learning_rate": 3.166299893665315e-07, "loss": 7.77253444539383e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 439.9375, "completions/min_length": 406.0, "epoch": 13.152941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7963243126869202, "kl": 0.008227137266658247, "learning_rate": 3.1651060417516397e-07, "loss": 8.17030668258667e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 454.75, "completions/min_length": 405.0, "epoch": 13.154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010212728753685951, "kl": 0.009117380017414689, "learning_rate": 3.163912310716134e-07, "loss": 9.142271301243454e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 445.625, "completions/min_length": 405.0, "epoch": 13.155882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.014599030837416649, "kl": 0.007528900634497404, "learning_rate": 3.162718700637439e-07, "loss": 7.615257345605642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 438.875, "completions/min_length": 388.0, "epoch": 13.157352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015243885107338428, "kl": 0.009065177291631699, "learning_rate": 3.1615252115941866e-07, "loss": 9.112971019931138e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 487.3125, "completions/min_length": 449.0, "epoch": 13.158823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0295788049697876, "kl": 0.010312578175216913, "learning_rate": 3.160331843665001e-07, "loss": 0.00010203413694398478, "reward": 0.71875, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 8948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 463.375, "completions/min_length": 409.0, "epoch": 13.160294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01131436601281166, "kl": 0.010386102367192507, "learning_rate": 3.1591385969284976e-07, "loss": 0.00010305101750418544, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 450.0, "completions/min_length": 412.0, "epoch": 13.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009335212409496307, "kl": 0.007788248243741691, "learning_rate": 3.1579454714632846e-07, "loss": 7.795658893883228e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 422.8125, "completions/min_length": 384.0, "epoch": 13.163235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 3.1974565982818604, "kl": 0.07741554267704487, "learning_rate": 3.1567524673479615e-07, "loss": 0.0007620304822921753, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 429.0, "completions/min_length": 369.0, "epoch": 13.16470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01618853397667408, "kl": 0.009237072430551052, "learning_rate": 3.155559584661123e-07, "loss": 9.274989133700728e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 431.8125, "completions/min_length": 393.0, "epoch": 13.166176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.0964181423187256, "kl": 0.00801653612870723, "learning_rate": 3.15436682348135e-07, "loss": 7.984787225723267e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 442.4375, "completions/min_length": 340.0, "epoch": 13.16764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015627095475792885, "kl": 0.01041757082566619, "learning_rate": 3.1531741838872205e-07, "loss": 0.00010133342584595084, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 425.6875, "completions/min_length": 376.0, "epoch": 13.169117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01023355033248663, "kl": 0.008144013700075448, "learning_rate": 3.151981665957302e-07, "loss": 8.137075928971171e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 480.875, "completions/min_length": 388.0, "epoch": 13.170588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.021357938647270203, "kl": 0.006388881476595998, "learning_rate": 3.1507892697701546e-07, "loss": 6.405694148270413e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 451.75, "completions/min_length": 410.0, "epoch": 13.172058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016997113823890686, "kl": 0.010983944870531559, "learning_rate": 3.14959699540433e-07, "loss": 0.00011027324944734573, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 459.1875, "completions/min_length": 414.0, "epoch": 13.173529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.071799397468567, "kl": 0.00952315516769886, "learning_rate": 3.1484048429383725e-07, "loss": 9.481806046096608e-05, "reward": 0.9802083373069763, "reward_std": 0.055979274213314056, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 8958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 419.625, "completions/min_length": 372.0, "epoch": 13.175, "frac_reward_zero_std": 1.0, "grad_norm": 0.02428065612912178, "kl": 0.011939067160710692, "learning_rate": 3.147212812450818e-07, "loss": 0.0001196553057525307, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 471.0, "completions/min_length": 317.0, "epoch": 13.176470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0109934750944376, "kl": 0.007218863815069199, "learning_rate": 3.146020904020196e-07, "loss": 7.287417247425765e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 486.4375, "completions/min_length": 450.0, "epoch": 13.177941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7981871962547302, "kl": 0.010063409921713173, "learning_rate": 3.1448291177250243e-07, "loss": 0.00010034441947937012, "reward": 0.1615833342075348, "reward_std": 0.03767688199877739, "rewards/DrugCombAccuracyCOTORM/mean": 0.027499999850988388, "rewards/DrugCombAccuracyCOTORM/std": 0.07514430582523346, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.3958333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.4254627227783203, "step": 8961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 446.5625, "completions/min_length": 399.0, "epoch": 13.179411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 1.1190394163131714, "kl": 0.022796787321567535, "learning_rate": 3.143637453643815e-07, "loss": 0.00022719745174981654, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 435.25, "completions/min_length": 377.0, "epoch": 13.180882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016825169324874878, "kl": 0.007065111654810607, "learning_rate": 3.142445911855072e-07, "loss": 7.048440602375194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 498.9375, "completions/min_length": 408.0, "epoch": 13.18235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.5215798616409302, "kl": 0.01291889138519764, "learning_rate": 3.141254492437292e-07, "loss": 0.00012942765897605568, "reward": 0.7327083349227905, "reward_std": 0.14488457143306732, "rewards/DrugCombAccuracyCOTORM/mean": 0.690625011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.38549479842185974, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8020833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.4876958131790161, "step": 8964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 487.8125, "completions/min_length": 434.0, "epoch": 13.183823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.9531290531158447, "kl": 0.009308097418397665, "learning_rate": 3.140063195468962e-07, "loss": 9.348243474960327e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 443.9375, "completions/min_length": 353.0, "epoch": 13.185294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010134167969226837, "kl": 0.007264975341968238, "learning_rate": 3.138872021028563e-07, "loss": 7.275761890923604e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 466.6875, "completions/min_length": 390.0, "epoch": 13.186764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0673152208328247, "kl": 0.01068844395922497, "learning_rate": 3.137680969194564e-07, "loss": 0.00010778219439089298, "reward": 0.7047500014305115, "reward_std": 0.11805392056703568, "rewards/DrugCombAccuracyCOTORM/mean": 0.6569792032241821, "rewards/DrugCombAccuracyCOTORM/std": 0.39994361996650696, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.28867512941360474, "step": 8967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 439.3125, "completions/min_length": 405.0, "epoch": 13.188235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.011123976670205593, "kl": 0.007696673972532153, "learning_rate": 3.1364900400454295e-07, "loss": 7.686250319238752e-05, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 8968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 462.0625, "completions/min_length": 375.0, "epoch": 13.189705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9258252382278442, "kl": 0.013133543077856302, "learning_rate": 3.135299233659616e-07, "loss": 0.00013256433885544538, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 434.9375, "completions/min_length": 359.0, "epoch": 13.191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7800862193107605, "kl": 0.008350946591235697, "learning_rate": 3.1341085501155695e-07, "loss": 8.331342542078346e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 426.0, "completions/min_length": 373.0, "epoch": 13.19264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0158313550055027, "kl": 0.009704415686428547, "learning_rate": 3.1329179894917303e-07, "loss": 9.739446977619082e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 450.875, "completions/min_length": 361.0, "epoch": 13.194117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01869947463274002, "kl": 0.00985186465550214, "learning_rate": 3.131727551866529e-07, "loss": 9.771762415766716e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 506.875, "completions/min_length": 406.0, "epoch": 13.195588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 1.4175152778625488, "kl": 0.016130239702761173, "learning_rate": 3.1305372373183887e-07, "loss": 0.00015620887279510498, "reward": 0.710812509059906, "reward_std": 0.31785866618156433, "rewards/DrugCombAccuracyCOTORM/mean": 0.6404687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.48291152715682983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.984375, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 8973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 493.0, "completions/min_length": 397.0, "epoch": 13.197058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0482386350631714, "kl": 0.011445823591202497, "learning_rate": 3.1293470459257234e-07, "loss": 0.00011444836854934692, "reward": 0.690530002117157, "reward_std": 0.13422052562236786, "rewards/DrugCombAccuracyCOTORM/mean": 0.6240999698638916, "rewards/DrugCombAccuracyCOTORM/std": 0.4480207562446594, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9125000238418579, "rewards/DrugCombCoverageCOTORM/std": 0.10246950387954712, "step": 8974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 422.375, "completions/min_length": 377.0, "epoch": 13.198529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660894975066185, "kl": 0.008164225611835718, "learning_rate": 3.128156977766941e-07, "loss": 8.161024015862495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 443.8125, "completions/min_length": 390.0, "epoch": 13.2, "frac_reward_zero_std": 0.5, "grad_norm": 1.0955092906951904, "kl": 0.007114013307727873, "learning_rate": 3.1269670329204393e-07, "loss": 7.06017017364502e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 420.375, "completions/min_length": 372.0, "epoch": 13.201470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009603691287338734, "kl": 0.006833677762188017, "learning_rate": 3.1257772114646095e-07, "loss": 6.886767368996516e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 492.5625, "completions/min_length": 406.0, "epoch": 13.202941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 1.3820377588272095, "kl": 0.011672260006889701, "learning_rate": 3.124587513477836e-07, "loss": 0.00011585652828216553, "reward": 0.7229166626930237, "reward_std": 0.38668400049209595, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.6800735592842102, "step": 8978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 436.9375, "completions/min_length": 377.0, "epoch": 13.204411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.2246174812316895, "kl": 0.010795976151712239, "learning_rate": 3.123397939038489e-07, "loss": 0.00010766834020614624, "reward": 0.887499988079071, "reward_std": 0.3181980550289154, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 8979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 438.0625, "completions/min_length": 393.0, "epoch": 13.205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8554797768592834, "kl": 0.007639637216925621, "learning_rate": 3.1222084882249375e-07, "loss": 7.640332478331402e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 503.125, "completions/min_length": 383.0, "epoch": 13.20735294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.3526352643966675, "kl": 0.012733039795421064, "learning_rate": 3.1210191611155377e-07, "loss": 0.00012760423123836517, "reward": 0.783750057220459, "reward_std": 0.3023466467857361, "rewards/DrugCombAccuracyCOTORM/mean": 0.7557291388511658, "rewards/DrugCombAccuracyCOTORM/std": 0.3252314031124115, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7916666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.4878145158290863, "step": 8981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 448.5, "completions/min_length": 395.0, "epoch": 13.208823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03842988982796669, "kl": 0.011617748183198273, "learning_rate": 3.1198299577886414e-07, "loss": 0.00011500349501147866, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 498.1875, "completions/min_length": 435.0, "epoch": 13.21029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0316314697265625, "kl": 0.009326187428086996, "learning_rate": 3.118640878322589e-07, "loss": 9.386241436004639e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 8983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 451.0625, "completions/min_length": 390.0, "epoch": 13.211764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.009202057495713234, "kl": 0.007540602586232126, "learning_rate": 3.1174519227957153e-07, "loss": 7.544670370407403e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 448.3125, "completions/min_length": 407.0, "epoch": 13.213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04792500287294388, "kl": 0.0099269401980564, "learning_rate": 3.1162630912863434e-07, "loss": 9.862236038316041e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 522.6875, "completions/min_length": 435.0, "epoch": 13.214705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8273633122444153, "kl": 0.008176054805517197, "learning_rate": 3.115074383872792e-07, "loss": 8.23289155960083e-05, "reward": 0.3151041865348816, "reward_std": 0.0044194171205163, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1666666716337204, "rewards/DrugCombCoverageCOTORM/std": 0.17213259637355804, "step": 8986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 407.1875, "completions/min_length": 331.0, "epoch": 13.216176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8134646415710449, "kl": 0.009635820635594428, "learning_rate": 3.113885800633371e-07, "loss": 9.618388867238536e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 436.5, "completions/min_length": 379.0, "epoch": 13.217647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.012455827556550503, "kl": 0.008514961693435907, "learning_rate": 3.11269734164638e-07, "loss": 8.516548405168578e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 471.5625, "completions/min_length": 407.0, "epoch": 13.219117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.5162999629974365, "kl": 0.01867379923351109, "learning_rate": 3.1115090069901115e-07, "loss": 0.00018233060836791992, "reward": 0.44999998807907104, "reward_std": 0.20701967179775238, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 448.9375, "completions/min_length": 411.0, "epoch": 13.220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8756391406059265, "kl": 0.0075329834362491965, "learning_rate": 3.1103207967428515e-07, "loss": 7.508881390094757e-05, "reward": 0.8142499923706055, "reward_std": 0.20634259283542633, "rewards/DrugCombAccuracyCOTORM/mean": 0.7912499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.3766496777534485, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 8990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 477.4375, "completions/min_length": 411.0, "epoch": 13.222058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1667324304580688, "kl": 0.009018518612720072, "learning_rate": 3.109132710982874e-07, "loss": 8.88332724571228e-05, "reward": 0.542187511920929, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 8991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 419.8125, "completions/min_length": 364.0, "epoch": 13.223529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.09425408393144608, "kl": 0.011050148750655353, "learning_rate": 3.1079447497884485e-07, "loss": 0.00010937020851997659, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 450.75, "completions/min_length": 378.0, "epoch": 13.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.8864374160766602, "kl": 0.024072435218840837, "learning_rate": 3.106756913237835e-07, "loss": 0.00024379894603043795, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 459.875, "completions/min_length": 396.0, "epoch": 13.226470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8711720705032349, "kl": 0.01016610860824585, "learning_rate": 3.105569201409285e-07, "loss": 0.00010230042971670628, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 8994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/mean_length": 516.3125, "completions/min_length": 398.0, "epoch": 13.227941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.4686026573181152, "kl": 0.015513918129727244, "learning_rate": 3.104381614381041e-07, "loss": 0.0001538395881652832, "reward": 0.41066884994506836, "reward_std": 0.353073388338089, "rewards/DrugCombAccuracyCOTORM/mean": 0.30656522512435913, "rewards/DrugCombAccuracyCOTORM/std": 0.41959118843078613, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6541666984558105, "rewards/DrugCombCoverageCOTORM/std": 0.36124786734580994, "step": 8995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 438.6875, "completions/min_length": 383.0, "epoch": 13.229411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01752823032438755, "kl": 0.009265419095754623, "learning_rate": 3.103194152231341e-07, "loss": 9.347898594569415e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 486.9375, "completions/min_length": 428.0, "epoch": 13.230882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0538812056183815, "kl": 0.01292197871953249, "learning_rate": 3.102006815038408e-07, "loss": 0.00013069843407720327, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 460.1875, "completions/min_length": 412.0, "epoch": 13.23235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6363381147384644, "kl": 0.0158593466039747, "learning_rate": 3.1008196028804636e-07, "loss": 0.00015655159950256348, "reward": 0.49270832538604736, "reward_std": 0.3221263587474823, "rewards/DrugCombAccuracyCOTORM/mean": 0.3854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.39308255910873413, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.84375, "rewards/DrugCombCoverageCOTORM/std": 0.5072392821311951, "step": 8998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 486.3125, "completions/min_length": 411.0, "epoch": 13.233823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2948538064956665, "kl": 0.013191303703933954, "learning_rate": 3.099632515835717e-07, "loss": 0.0001328401267528534, "reward": 0.699999988079071, "reward_std": 0.3484410345554352, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 8999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 447.875, "completions/min_length": 417.0, "epoch": 13.235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.017790181562304497, "kl": 0.007020089426077902, "learning_rate": 3.098445553982372e-07, "loss": 7.050057320157066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 431.5, "completions/min_length": 385.0, "epoch": 13.236764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1450657844543457, "kl": 0.010675679077394307, "learning_rate": 3.0972587173986206e-07, "loss": 0.00010646041482686996, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 455.8125, "completions/min_length": 424.0, "epoch": 13.238235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0083463191986084, "kl": 0.00993927416857332, "learning_rate": 3.096072006162651e-07, "loss": 9.885449253488332e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 492.5, "completions/min_length": 425.0, "epoch": 13.239705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9940731525421143, "kl": 0.010255235712975264, "learning_rate": 3.0948854203526384e-07, "loss": 0.0001024000666802749, "reward": 0.7000000476837158, "reward_std": 0.16402672231197357, "rewards/DrugCombAccuracyCOTORM/mean": 0.65625, "rewards/DrugCombAccuracyCOTORM/std": 0.4236907958984375, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.5773502588272095, "step": 9003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 426.125, "completions/min_length": 366.0, "epoch": 13.241176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.04071568325161934, "kl": 0.010048280004411936, "learning_rate": 3.093698960046753e-07, "loss": 9.965620120055974e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 451.375, "completions/min_length": 374.0, "epoch": 13.242647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0101686716079712, "kl": 0.008312234189361334, "learning_rate": 3.092512625323156e-07, "loss": 8.34539532661438e-05, "reward": 0.7124166488647461, "reward_std": 0.11620121449232101, "rewards/DrugCombAccuracyCOTORM/mean": 0.6587499976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.3996310830116272, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 9005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 405.0, "completions/min_length": 345.0, "epoch": 13.244117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010618329048156738, "kl": 0.008301633293740451, "learning_rate": 3.09132641626e-07, "loss": 8.274341962533072e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 439.5625, "completions/min_length": 392.0, "epoch": 13.245588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.049296166747808456, "kl": 0.008625684538856149, "learning_rate": 3.090140332935429e-07, "loss": 8.728605462238193e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 420.3125, "completions/min_length": 358.0, "epoch": 13.24705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01690063439309597, "kl": 0.00972955406177789, "learning_rate": 3.0889543754275816e-07, "loss": 9.689776197774336e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/mean_length": 493.1875, "completions/min_length": 401.0, "epoch": 13.248529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8429033160209656, "kl": 0.009178963489830494, "learning_rate": 3.087768543814582e-07, "loss": 9.088583465199918e-05, "reward": 0.8818666934967041, "reward_std": 0.1763792186975479, "rewards/DrugCombAccuracyCOTORM/mean": 0.8591041564941406, "rewards/DrugCombAccuracyCOTORM/std": 0.3236616253852844, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9458333253860474, "rewards/DrugCombCoverageCOTORM/std": 0.17078250646591187, "step": 9009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 482.75, "completions/min_length": 383.0, "epoch": 13.25, "frac_reward_zero_std": 0.5, "grad_norm": 1.1737008094787598, "kl": 0.00992075796239078, "learning_rate": 3.086582838174551e-07, "loss": 0.00010002031922340393, "reward": 0.7937500476837158, "reward_std": 0.13567513227462769, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.3333333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 436.375, "completions/min_length": 406.0, "epoch": 13.251470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010912055149674416, "kl": 0.00918291078414768, "learning_rate": 3.0853972585856016e-07, "loss": 9.225512621924281e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 480.5, "completions/min_length": 372.0, "epoch": 13.25294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9598020911216736, "kl": 0.007611811859533191, "learning_rate": 3.0842118051258347e-07, "loss": 7.661286508664489e-05, "reward": 0.7062318325042725, "reward_std": 0.07689610123634338, "rewards/DrugCombAccuracyCOTORM/mean": 0.6510189175605774, "rewards/DrugCombAccuracyCOTORM/std": 0.3806686997413635, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 9012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 431.9375, "completions/min_length": 388.0, "epoch": 13.254411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01028046291321516, "kl": 0.006556678446941078, "learning_rate": 3.0830264778733475e-07, "loss": 6.516927533084527e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 519.75, "completions/min_length": 445.0, "epoch": 13.255882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8930637836456299, "kl": 0.009701699367724359, "learning_rate": 3.081841276906223e-07, "loss": 9.649604180594906e-05, "reward": 0.875, "reward_std": 0.12232868373394012, "rewards/DrugCombAccuracyCOTORM/mean": 0.84765625, "rewards/DrugCombAccuracyCOTORM/std": 0.2581944167613983, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.06718549132347107, "step": 9014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 437.0, "completions/min_length": 378.0, "epoch": 13.257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.009872986935079098, "kl": 0.006967502296902239, "learning_rate": 3.0806562023025407e-07, "loss": 7.00042728567496e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 478.1875, "completions/min_length": 408.0, "epoch": 13.258823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.746805727481842, "kl": 0.013566159759648144, "learning_rate": 3.079471254140371e-07, "loss": 0.0001350492238998413, "reward": 0.6576874852180481, "reward_std": 0.14201205968856812, "rewards/DrugCombAccuracyCOTORM/mean": 0.5779687166213989, "rewards/DrugCombAccuracyCOTORM/std": 0.4977610111236572, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.10077822208404541, "step": 9016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 438.625, "completions/min_length": 415.0, "epoch": 13.260294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0439351797103882, "kl": 0.011641611810773611, "learning_rate": 3.078286432497775e-07, "loss": 0.00011627402273006737, "reward": 0.8979166746139526, "reward_std": 0.17597517371177673, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 447.6875, "completions/min_length": 410.0, "epoch": 13.261764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9578052163124084, "kl": 0.007472959929145873, "learning_rate": 3.077101737452805e-07, "loss": 7.466226816177368e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 9018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 446.4375, "completions/min_length": 399.0, "epoch": 13.263235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8615937232971191, "kl": 0.011354134185239673, "learning_rate": 3.0759171690835077e-07, "loss": 0.00011417792848078534, "reward": 0.32500001788139343, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.25, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 9019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 460.75, "completions/min_length": 405.0, "epoch": 13.264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.0973399877548218, "kl": 0.008333238074555993, "learning_rate": 3.074732727467917e-07, "loss": 8.25151801109314e-05, "reward": 0.6989583373069763, "reward_std": 0.16675221920013428, "rewards/DrugCombAccuracyCOTORM/mean": 0.6979166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3859512209892273, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.40625, "rewards/DrugCombCoverageCOTORM/std": 0.8605957627296448, "step": 9020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 466.125, "completions/min_length": 408.0, "epoch": 13.266176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.3002498149871826, "kl": 0.013338886201381683, "learning_rate": 3.0735484126840615e-07, "loss": 0.00013319068239070475, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 444.25, "completions/min_length": 322.0, "epoch": 13.26764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.009476815350353718, "kl": 0.007256820099428296, "learning_rate": 3.072364224809962e-07, "loss": 7.288659980986267e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 456.3125, "completions/min_length": 399.0, "epoch": 13.269117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.010023403912782669, "kl": 0.006946098292246461, "learning_rate": 3.071180163923629e-07, "loss": 6.957704317755997e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 470.625, "completions/min_length": 357.0, "epoch": 13.270588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.015392055734992027, "kl": 0.009325048187747598, "learning_rate": 3.069996230103066e-07, "loss": 9.161319758277386e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 433.0, "completions/min_length": 386.0, "epoch": 13.272058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.55257248878479, "kl": 0.01425444521009922, "learning_rate": 3.0688124234262674e-07, "loss": 0.00014124848530627787, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 468.5625, "completions/min_length": 369.0, "epoch": 13.273529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8955742716789246, "kl": 0.00950215698685497, "learning_rate": 3.0676287439712184e-07, "loss": 9.498123836237937e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 454.6875, "completions/min_length": 397.0, "epoch": 13.275, "frac_reward_zero_std": 1.0, "grad_norm": 0.010097406804561615, "kl": 0.0076340819941833615, "learning_rate": 3.066445191815897e-07, "loss": 7.637450471520424e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 435.375, "completions/min_length": 369.0, "epoch": 13.276470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017270417883992195, "kl": 0.0077433588448911905, "learning_rate": 3.065261767038274e-07, "loss": 7.698552508372813e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 412.5, "completions/min_length": 338.0, "epoch": 13.277941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012818459421396255, "kl": 0.007040228811092675, "learning_rate": 3.0640784697163094e-07, "loss": 7.047421240713447e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 441.75, "completions/min_length": 390.0, "epoch": 13.279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8866724967956543, "kl": 0.012141553917899728, "learning_rate": 3.062895299927956e-07, "loss": 0.00012039186549372971, "reward": 0.7749999761581421, "reward_std": 0.24348656833171844, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 9030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 469.25, "completions/min_length": 380.0, "epoch": 13.280882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.6559761762619019, "kl": 0.008149083238095045, "learning_rate": 3.061712257751158e-07, "loss": 8.119208359858021e-05, "reward": 0.9551249742507935, "reward_std": 0.12692566215991974, "rewards/DrugCombAccuracyCOTORM/mean": 0.9478124976158142, "rewards/DrugCombAccuracyCOTORM/std": 0.20874999463558197, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 9031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 436.875, "completions/min_length": 384.0, "epoch": 13.282352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 1.374590516090393, "kl": 0.009559122961945832, "learning_rate": 3.060529343263851e-07, "loss": 9.597465395927429e-05, "reward": 0.7858500480651855, "reward_std": 0.34270596504211426, "rewards/DrugCombAccuracyCOTORM/mean": 0.7369999885559082, "rewards/DrugCombAccuracyCOTORM/std": 0.40974658727645874, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.08062257617712021, "step": 9032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 445.5, "completions/min_length": 392.0, "epoch": 13.283823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010416409000754356, "kl": 0.0072125488659366965, "learning_rate": 3.0593465565439636e-07, "loss": 7.228438335005194e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 448.25, "completions/min_length": 381.0, "epoch": 13.285294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.021210703998804092, "kl": 0.007220171391963959, "learning_rate": 3.058163897669412e-07, "loss": 7.190112228272483e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 412.5625, "completions/min_length": 364.0, "epoch": 13.286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018238265067338943, "kl": 0.008040815824642777, "learning_rate": 3.0569813667181106e-07, "loss": 8.106325549306348e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 442.5, "completions/min_length": 393.0, "epoch": 13.288235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 0.930753767490387, "kl": 0.008267944562248886, "learning_rate": 3.0557989637679584e-07, "loss": 8.206468191929162e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 481.4375, "completions/min_length": 415.0, "epoch": 13.28970588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.124819040298462, "kl": 0.012274198117665946, "learning_rate": 3.054616688896852e-07, "loss": 0.00012230873107910156, "reward": 0.49375003576278687, "reward_std": 0.4412066340446472, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 467.375, "completions/min_length": 425.0, "epoch": 13.291176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.1016237735748291, "kl": 0.01607110002078116, "learning_rate": 3.0534345421826736e-07, "loss": 0.00015963234181981534, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 454.25, "completions/min_length": 390.0, "epoch": 13.29264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006636543199419975, "kl": 0.005828265682794154, "learning_rate": 3.0522525237033013e-07, "loss": 5.85414600209333e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 408.9375, "completions/min_length": 340.0, "epoch": 13.294117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.02671247348189354, "kl": 0.010814814129844308, "learning_rate": 3.0510706335366034e-07, "loss": 0.00010699580889195204, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 497.125, "completions/min_length": 408.0, "epoch": 13.295588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.1333800554275513, "kl": 0.009727128548547626, "learning_rate": 3.0498888717604406e-07, "loss": 9.707361459732056e-05, "reward": 0.9923294186592102, "reward_std": 0.021695686504244804, "rewards/DrugCombAccuracyCOTORM/mean": 0.9904117584228516, "rewards/DrugCombAccuracyCOTORM/std": 0.03835293650627136, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 446.4375, "completions/min_length": 407.0, "epoch": 13.297058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9067978262901306, "kl": 0.009107148624025285, "learning_rate": 3.0487072384526633e-07, "loss": 9.073317050933838e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/mean_length": 464.625, "completions/min_length": 373.0, "epoch": 13.298529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8798612952232361, "kl": 0.010606299503706396, "learning_rate": 3.0475257336911164e-07, "loss": 0.00010700518760131672, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 468.0625, "completions/min_length": 385.0, "epoch": 13.3, "frac_reward_zero_std": 0.5, "grad_norm": 1.1431708335876465, "kl": 0.0081078065559268, "learning_rate": 3.0463443575536317e-07, "loss": 8.097290992736816e-05, "reward": 0.9937499761581421, "reward_std": 0.017677659168839455, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 475.25, "completions/min_length": 427.0, "epoch": 13.301470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7450515627861023, "kl": 0.008879199856892228, "learning_rate": 3.045163110118037e-07, "loss": 8.890928438631818e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 427.6875, "completions/min_length": 381.0, "epoch": 13.302941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.079532191157341, "kl": 0.009719288675114512, "learning_rate": 3.0439819914621496e-07, "loss": 9.745347779244184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 440.3125, "completions/min_length": 365.0, "epoch": 13.304411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.3953510522842407, "kl": 0.00892670510802418, "learning_rate": 3.042801001663778e-07, "loss": 8.890032768249512e-05, "reward": 0.6089166402816772, "reward_std": 0.2576225996017456, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 498.3125, "completions/min_length": 434.0, "epoch": 13.305882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.10415780544281, "kl": 0.011120176874101162, "learning_rate": 3.0416201408007235e-07, "loss": 0.00011198091669939458, "reward": 0.5860000252723694, "reward_std": 0.04615088924765587, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.49441322684288025, "step": 9048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 410.8125, "completions/min_length": 355.0, "epoch": 13.30735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.032505277544260025, "kl": 0.010418249759823084, "learning_rate": 3.04043940895078e-07, "loss": 0.00010361027671024203, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 505.9375, "completions/min_length": 485.0, "epoch": 13.308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7411673665046692, "kl": 0.008450735243968666, "learning_rate": 3.0392588061917275e-07, "loss": 8.442725811619312e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/mean_length": 460.5, "completions/min_length": 414.0, "epoch": 13.310294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4115756750106812, "kl": 0.012936535757035017, "learning_rate": 3.0380783326013424e-07, "loss": 0.00013034790754318237, "reward": 0.4750000238418579, "reward_std": 0.4256991147994995, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 9051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 499.0625, "completions/min_length": 438.0, "epoch": 13.311764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011925945989787579, "kl": 0.00797776144463569, "learning_rate": 3.0368979882573917e-07, "loss": 7.955943146953359e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 426.0625, "completions/min_length": 392.0, "epoch": 13.313235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.021061034873127937, "kl": 0.01012385357171297, "learning_rate": 3.0357177732376347e-07, "loss": 0.00010119027137989178, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 460.6875, "completions/min_length": 422.0, "epoch": 13.314705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.016668055206537247, "kl": 0.008566893520765007, "learning_rate": 3.0345376876198194e-07, "loss": 8.520733535988256e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 424.25, "completions/min_length": 372.0, "epoch": 13.316176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012632668018341064, "kl": 0.008420754922553897, "learning_rate": 3.0333577314816875e-07, "loss": 8.373410673812032e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 452.4375, "completions/min_length": 415.0, "epoch": 13.31764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.2140588760375977, "kl": 0.012270769802853465, "learning_rate": 3.032177904900971e-07, "loss": 0.0001226663589477539, "reward": 0.6000000238418579, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 473.8125, "completions/min_length": 397.0, "epoch": 13.319117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.7686807513237, "kl": 0.010174820548854768, "learning_rate": 3.030998207955394e-07, "loss": 0.00010105158435180783, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 449.5, "completions/min_length": 404.0, "epoch": 13.320588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008810300379991531, "kl": 0.006859736400656402, "learning_rate": 3.029818640722672e-07, "loss": 6.811233470216393e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 452.9375, "completions/min_length": 393.0, "epoch": 13.322058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.014418856240808964, "kl": 0.009269515983760357, "learning_rate": 3.028639203280512e-07, "loss": 9.209710697177798e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/mean_length": 449.0, "completions/min_length": 355.0, "epoch": 13.323529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.8660316467285156, "kl": 0.00595125462859869, "learning_rate": 3.027459895706613e-07, "loss": 5.9110112488269806e-05, "reward": 0.9089166522026062, "reward_std": 0.16972768306732178, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.30663496255874634, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 481.6875, "completions/min_length": 441.0, "epoch": 13.325, "frac_reward_zero_std": 1.0, "grad_norm": 0.009402427822351456, "kl": 0.007328870357014239, "learning_rate": 3.026280718078664e-07, "loss": 7.326464401558042e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 442.5, "completions/min_length": 388.0, "epoch": 13.326470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.016113746911287308, "kl": 0.009661111049354076, "learning_rate": 3.025101670474347e-07, "loss": 9.672107262304053e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 453.75, "completions/min_length": 397.0, "epoch": 13.327941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.2490671873092651, "kl": 0.008346925489604473, "learning_rate": 3.0239227529713327e-07, "loss": 8.375570178031921e-05, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 9063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 446.8125, "completions/min_length": 392.0, "epoch": 13.329411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8031873106956482, "kl": 0.006261504604481161, "learning_rate": 3.0227439656472876e-07, "loss": 6.307289004325867e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 426.125, "completions/min_length": 380.0, "epoch": 13.330882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0841634273529053, "kl": 0.013277591788209975, "learning_rate": 3.021565308579866e-07, "loss": 0.00013250112533569336, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/mean_length": 469.0, "completions/min_length": 292.0, "epoch": 13.33235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7751719951629639, "kl": 0.006929850904271007, "learning_rate": 3.020386781846715e-07, "loss": 6.995536386966705e-05, "reward": 0.9575520753860474, "reward_std": 0.08023973554372787, "rewards/DrugCombAccuracyCOTORM/mean": 0.9479166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.145535409450531, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 9066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 451.875, "completions/min_length": 375.0, "epoch": 13.333823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.04884137585759163, "kl": 0.010053038247860968, "learning_rate": 3.0192083855254735e-07, "loss": 0.00010031973215518519, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 456.4375, "completions/min_length": 426.0, "epoch": 13.33529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8966184258460999, "kl": 0.01083421939983964, "learning_rate": 3.018030119693771e-07, "loss": 0.00010772919631563127, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 498.375, "completions/min_length": 429.0, "epoch": 13.336764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 1.4036167860031128, "kl": 0.01426719012670219, "learning_rate": 3.0168519844292273e-07, "loss": 0.00014376267790794373, "reward": 0.2825833261013031, "reward_std": 0.23222069442272186, "rewards/DrugCombAccuracyCOTORM/mean": 0.22562500834465027, "rewards/DrugCombAccuracyCOTORM/std": 0.41292405128479004, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 0.873212456703186, "step": 9069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 438.1875, "completions/min_length": 369.0, "epoch": 13.338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010859206318855286, "kl": 0.006799141061492264, "learning_rate": 3.015673979809457e-07, "loss": 6.809573824284598e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 382.5625, "completions/min_length": 319.0, "epoch": 13.339705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.5467926263809204, "kl": 0.009796235943213105, "learning_rate": 3.0144961059120624e-07, "loss": 9.679794311523438e-05, "reward": 0.8999999761581421, "reward_std": 0.2828426957130432, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 495.6875, "completions/min_length": 423.0, "epoch": 13.341176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.015238931402564049, "kl": 0.007776105427183211, "learning_rate": 3.0133183628146397e-07, "loss": 7.759901927784085e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 433.6875, "completions/min_length": 361.0, "epoch": 13.342647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.023853372782468796, "kl": 0.00958676217123866, "learning_rate": 3.012140750594777e-07, "loss": 9.559921454638243e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 438.875, "completions/min_length": 390.0, "epoch": 13.344117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.024344392120838165, "kl": 0.008608376607298851, "learning_rate": 3.01096326933005e-07, "loss": 8.602317393524572e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 495.0625, "completions/min_length": 402.0, "epoch": 13.345588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.2752703428268433, "kl": 0.019139716285280883, "learning_rate": 3.009785919098029e-07, "loss": 0.0001919567584991455, "reward": 0.6182500123977661, "reward_std": 0.04212481901049614, "rewards/DrugCombAccuracyCOTORM/mean": 0.5618749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.45549196004867554, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.35939764976501465, "step": 9075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 430.875, "completions/min_length": 376.0, "epoch": 13.347058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.011129852384328842, "kl": 0.010196003830060363, "learning_rate": 3.008608699976275e-07, "loss": 0.00010161032696487382, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 450.6875, "completions/min_length": 375.0, "epoch": 13.348529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 1.0029603242874146, "kl": 0.009027534862980247, "learning_rate": 3.0074316120423405e-07, "loss": 8.986145257949829e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/mean_length": 556.25, "completions/min_length": 438.0, "epoch": 13.35, "frac_reward_zero_std": 0.5, "grad_norm": 0.6533583402633667, "kl": 0.008937560603953898, "learning_rate": 3.006254655373769e-07, "loss": 8.8961111032404e-05, "reward": 0.7229982614517212, "reward_std": 0.14263848960399628, "rewards/DrugCombAccuracyCOTORM/mean": 0.6771852970123291, "rewards/DrugCombAccuracyCOTORM/std": 0.38744574785232544, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5107547044754028, "step": 9078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 456.9375, "completions/min_length": 386.0, "epoch": 13.351470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.024624165147542953, "kl": 0.012105812784284353, "learning_rate": 3.005077830048096e-07, "loss": 0.0001207847089972347, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 446.0625, "completions/min_length": 364.0, "epoch": 13.352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.8473586440086365, "kl": 0.017721243435516953, "learning_rate": 3.0039011361428464e-07, "loss": 0.00018077329150401056, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 492.375, "completions/min_length": 404.0, "epoch": 13.354411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.01821775734424591, "kl": 0.012905123527161777, "learning_rate": 3.0027245737355375e-07, "loss": 0.0001233257062267512, "reward": 0.7016666531562805, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.637499988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3743883967399597, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 9081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 527.4375, "completions/min_length": 430.0, "epoch": 13.355882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.3582847118377686, "kl": 0.0162192489951849, "learning_rate": 3.00154814290368e-07, "loss": 0.00015950947999954224, "reward": 0.4625000059604645, "reward_std": 0.36878854036331177, "rewards/DrugCombAccuracyCOTORM/mean": 0.34375, "rewards/DrugCombAccuracyCOTORM/std": 0.4366062581539154, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 455.5, "completions/min_length": 370.0, "epoch": 13.35735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8590197563171387, "kl": 0.009231383679434657, "learning_rate": 3.0003718437247737e-07, "loss": 9.124726057052612e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 397.5625, "completions/min_length": 341.0, "epoch": 13.358823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.015271504409611225, "kl": 0.009426939417608082, "learning_rate": 2.9991956762763095e-07, "loss": 9.523496555630118e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 562.625, "completions/min_length": 487.0, "epoch": 13.360294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.0668708086013794, "kl": 0.010471844812855124, "learning_rate": 2.998019640635772e-07, "loss": 0.00010525062680244446, "reward": 0.4918641448020935, "reward_std": 0.1310897022485733, "rewards/DrugCombAccuracyCOTORM/mean": 0.4234239161014557, "rewards/DrugCombAccuracyCOTORM/std": 0.357963889837265, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.53125, "rewards/DrugCombCoverageCOTORM/std": 0.6164977550506592, "step": 9085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 452.6875, "completions/min_length": 385.0, "epoch": 13.361764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015412322245538235, "kl": 0.009823905536904931, "learning_rate": 2.996843736880632e-07, "loss": 9.774741920409724e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 451.4375, "completions/min_length": 393.0, "epoch": 13.363235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013522109016776085, "kl": 0.008070319658145308, "learning_rate": 2.995667965088359e-07, "loss": 8.032740879571065e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 435.25, "completions/min_length": 387.0, "epoch": 13.364705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.014674385078251362, "kl": 0.010767213301733136, "learning_rate": 2.994492325336406e-07, "loss": 0.0001075338659575209, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 447.8125, "completions/min_length": 376.0, "epoch": 13.366176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.009132971987128258, "kl": 0.0069019884103909135, "learning_rate": 2.993316817702225e-07, "loss": 6.85781124047935e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 429.625, "completions/min_length": 372.0, "epoch": 13.367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.03994777798652649, "kl": 0.00967367913108319, "learning_rate": 2.9921414422632534e-07, "loss": 9.702273382572457e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 515.75, "completions/min_length": 447.0, "epoch": 13.369117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 1.2190971374511719, "kl": 0.009384154109284282, "learning_rate": 2.9909661990969236e-07, "loss": 9.359791874885559e-05, "reward": 0.6625000238418579, "reward_std": 0.36080265045166016, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.4281744360923767, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.4281744360923767, "step": 9091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 434.4375, "completions/min_length": 385.0, "epoch": 13.370588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.013505593873560429, "kl": 0.007467895979061723, "learning_rate": 2.989791088280655e-07, "loss": 7.459981134161353e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 502.5, "completions/min_length": 348.0, "epoch": 13.37205882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.2137126922607422, "kl": 0.010653938632458448, "learning_rate": 2.9886161098918623e-07, "loss": 0.00010733306407928467, "reward": 0.4000000059604645, "reward_std": 0.40868258476257324, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 9093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 445.9375, "completions/min_length": 398.0, "epoch": 13.373529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 13.630817413330078, "kl": 0.0733833301346749, "learning_rate": 2.98744126400795e-07, "loss": 0.0007712369551882148, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 406.9375, "completions/min_length": 361.0, "epoch": 13.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.009199238382279873, "kl": 0.0066056434297934175, "learning_rate": 2.9862665507063144e-07, "loss": 6.628362461924553e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 445.9375, "completions/min_length": 384.0, "epoch": 13.376470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.1127703189849854, "kl": 0.009596157819032669, "learning_rate": 2.9850919700643416e-07, "loss": 9.609402331989259e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 497.9375, "completions/min_length": 454.0, "epoch": 13.37794117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3642398118972778, "kl": 0.009693736908957362, "learning_rate": 2.9839175221594127e-07, "loss": 9.690597653388977e-05, "reward": 0.7010399103164673, "reward_std": 0.3379477262496948, "rewards/DrugCombAccuracyCOTORM/mean": 0.6542946696281433, "rewards/DrugCombAccuracyCOTORM/std": 0.41480007767677307, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7760416865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5116411447525024, "step": 9097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 457.8125, "completions/min_length": 395.0, "epoch": 13.379411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012948334217071533, "kl": 0.00968744303099811, "learning_rate": 2.982743207068894e-07, "loss": 9.701292583486065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/mean_length": 524.5625, "completions/min_length": 462.0, "epoch": 13.380882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2228058576583862, "kl": 0.014023337280377746, "learning_rate": 2.981569024870147e-07, "loss": 0.00013748928904533386, "reward": 0.6026666760444641, "reward_std": 0.35890668630599976, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.14907120168209076, "step": 9099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 461.5, "completions/min_length": 407.0, "epoch": 13.382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8195587396621704, "kl": 0.010344271082431078, "learning_rate": 2.9803949756405254e-07, "loss": 0.00010287927580066025, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 446.1875, "completions/min_length": 395.0, "epoch": 13.383823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9635434746742249, "kl": 0.009338956791907549, "learning_rate": 2.979221059457372e-07, "loss": 9.417533874511719e-05, "reward": 0.3213333487510681, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.17249999940395355, "rewards/DrugCombAccuracyCOTORM/std": 0.2464548647403717, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 9101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 423.9375, "completions/min_length": 370.0, "epoch": 13.385294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.025688396766781807, "kl": 0.008030858705751598, "learning_rate": 2.9780472763980213e-07, "loss": 8.075112418737262e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 435.375, "completions/min_length": 366.0, "epoch": 13.386764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011445448733866215, "kl": 0.009206936461851, "learning_rate": 2.9768736265398e-07, "loss": 9.192567085847259e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 439.6875, "completions/min_length": 407.0, "epoch": 13.388235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.051721010357141495, "kl": 0.009423777111805975, "learning_rate": 2.9757001099600224e-07, "loss": 9.292629692936316e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 538.125, "completions/min_length": 470.0, "epoch": 13.389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.6236274242401123, "kl": 0.011067437008023262, "learning_rate": 2.9745267267359995e-07, "loss": 0.00011131046630907804, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 525.375, "completions/min_length": 369.0, "epoch": 13.391176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 1.4357396364212036, "kl": 0.01835021050646901, "learning_rate": 2.9733534769450296e-07, "loss": 0.0001818537712097168, "reward": 0.717861533164978, "reward_std": 0.32680225372314453, "rewards/DrugCombAccuracyCOTORM/mean": 0.6714154481887817, "rewards/DrugCombAccuracyCOTORM/std": 0.3908659815788269, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8072916865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4010619521141052, "step": 9106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 422.0625, "completions/min_length": 368.0, "epoch": 13.39264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.14072617888450623, "kl": 0.011785236420109868, "learning_rate": 2.9721803606644045e-07, "loss": 0.000117477000458166, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 403.3125, "completions/min_length": 331.0, "epoch": 13.394117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.015701035037636757, "kl": 0.007351075066253543, "learning_rate": 2.971007377971405e-07, "loss": 7.379493763437495e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 488.5, "completions/min_length": 445.0, "epoch": 13.395588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.011480339802801609, "kl": 0.007276564952917397, "learning_rate": 2.9698345289433056e-07, "loss": 7.307998021133244e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 424.0625, "completions/min_length": 343.0, "epoch": 13.397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.008937638252973557, "kl": 0.0064383510034531355, "learning_rate": 2.968661813657369e-07, "loss": 6.451706576626748e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 475.125, "completions/min_length": 365.0, "epoch": 13.398529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9324155449867249, "kl": 0.00941014033742249, "learning_rate": 2.967489232190852e-07, "loss": 9.497255086898804e-05, "reward": 0.6263333559036255, "reward_std": 0.061930831521749496, "rewards/DrugCombAccuracyCOTORM/mean": 0.5641666650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.45625773072242737, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.49441322684288025, "step": 9111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 469.9375, "completions/min_length": 399.0, "epoch": 13.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.05300402268767357, "kl": 0.011113883927464485, "learning_rate": 2.9663167846209996e-07, "loss": 0.00010999292135238647, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 412.5, "completions/min_length": 354.0, "epoch": 13.401470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0463441610336304, "kl": 0.009067930397577584, "learning_rate": 2.965144471025051e-07, "loss": 8.924305438995361e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 459.6875, "completions/min_length": 406.0, "epoch": 13.402941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.03823874518275261, "kl": 0.00857765635009855, "learning_rate": 2.9639722914802355e-07, "loss": 8.524712029611692e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 481.0625, "completions/min_length": 340.0, "epoch": 13.404411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.905285656452179, "kl": 0.0101683943066746, "learning_rate": 2.9628002460637737e-07, "loss": 0.00010247528553009033, "reward": 0.8365777730941772, "reward_std": 0.09976789355278015, "rewards/DrugCombAccuracyCOTORM/mean": 0.8235000371932983, "rewards/DrugCombAccuracyCOTORM/std": 0.2422863245010376, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7777777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.25337231159210205, "step": 9115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 453.375, "completions/min_length": 358.0, "epoch": 13.405882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9557948708534241, "kl": 0.008673508535139263, "learning_rate": 2.9616283348528755e-07, "loss": 8.73953104019165e-05, "reward": 0.5221111178398132, "reward_std": 0.031422026455402374, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5034001469612122, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.1111111044883728, "rewards/DrugCombCoverageCOTORM/std": 0.9677941799163818, "step": 9116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 442.375, "completions/min_length": 405.0, "epoch": 13.407352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.03070988319814205, "kl": 0.010596510954201221, "learning_rate": 2.9604565579247436e-07, "loss": 0.00010571055463515222, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 462.1875, "completions/min_length": 415.0, "epoch": 13.408823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.009617469273507595, "kl": 0.006424581864848733, "learning_rate": 2.9592849153565724e-07, "loss": 6.414092058548704e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 510.5625, "completions/min_length": 440.0, "epoch": 13.410294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.892169713973999, "kl": 0.012613060767762363, "learning_rate": 2.958113407225547e-07, "loss": 0.0001259753480553627, "reward": 0.6822916865348816, "reward_std": 0.19656966626644135, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8229166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.30103984475135803, "step": 9119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 449.875, "completions/min_length": 391.0, "epoch": 13.411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03746361285448074, "kl": 0.012433835538104177, "learning_rate": 2.9569420336088423e-07, "loss": 0.00012528077058959752, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 445.5, "completions/min_length": 379.0, "epoch": 13.413235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.01986253447830677, "kl": 0.010731679736636579, "learning_rate": 2.9557707945836277e-07, "loss": 0.0001072115555871278, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 489.5625, "completions/min_length": 442.0, "epoch": 13.41470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5189151763916016, "kl": 0.013002553954720497, "learning_rate": 2.9545996902270585e-07, "loss": 0.000128820538520813, "reward": 0.9270833730697632, "reward_std": 0.2062394767999649, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 449.125, "completions/min_length": 393.0, "epoch": 13.416176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 1.1845874786376953, "kl": 0.010532753076404333, "learning_rate": 2.9534287206162857e-07, "loss": 0.00010513514280319214, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 449.5, "completions/min_length": 397.0, "epoch": 13.41764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.6356059312820435, "kl": 0.00958994822576642, "learning_rate": 2.952257885828449e-07, "loss": 9.608268737792969e-05, "reward": 0.7122499942779541, "reward_std": 0.3396519124507904, "rewards/DrugCombAccuracyCOTORM/mean": 0.6559374928474426, "rewards/DrugCombAccuracyCOTORM/std": 0.4617077708244324, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.22360680997371674, "step": 9124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 430.5625, "completions/min_length": 392.0, "epoch": 13.419117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.015835091471672058, "kl": 0.009429614059627056, "learning_rate": 2.951087185940681e-07, "loss": 9.375937224831432e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 445.625, "completions/min_length": 403.0, "epoch": 13.420588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01604507863521576, "kl": 0.007174195256084204, "learning_rate": 2.949916621030104e-07, "loss": 7.167512376327068e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/mean_length": 438.5, "completions/min_length": 354.0, "epoch": 13.422058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.8265008330345154, "kl": 0.00946629187092185, "learning_rate": 2.9487461911738333e-07, "loss": 9.474903345108032e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 461.375, "completions/min_length": 395.0, "epoch": 13.423529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9024080038070679, "kl": 0.006941743311472237, "learning_rate": 2.947575896448971e-07, "loss": 6.919533188920468e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 512.1875, "completions/min_length": 413.0, "epoch": 13.425, "frac_reward_zero_std": 0.5, "grad_norm": 0.9298730492591858, "kl": 0.009110461687669158, "learning_rate": 2.9464057369326143e-07, "loss": 9.055350528797135e-05, "reward": 0.9416666626930237, "reward_std": 0.10801234096288681, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2277100384235382, "step": 9129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 441.0, "completions/min_length": 402.0, "epoch": 13.426470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9297679662704468, "kl": 0.009459801949560642, "learning_rate": 2.9452357127018514e-07, "loss": 9.429547208128497e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 431.0625, "completions/min_length": 376.0, "epoch": 13.427941176470588, "frac_reward_zero_std": 1.0, "grad_norm": Infinity, "kl": 4.057938434557806e+32, "learning_rate": 2.944065823833759e-07, "loss": 4.371020905386869e+30, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 479.25, "completions/min_length": 380.0, "epoch": 13.429411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.017089439556002617, "kl": 0.00898062961641699, "learning_rate": 2.942896070405407e-07, "loss": 9.003809827845544e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 463.125, "completions/min_length": 400.0, "epoch": 13.430882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.011521040461957455, "kl": 0.007131917518563569, "learning_rate": 2.9417264524938583e-07, "loss": 7.099704816937447e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 476.1875, "completions/min_length": 429.0, "epoch": 13.43235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.7604448795318604, "kl": 0.012613683007657528, "learning_rate": 2.94055697017616e-07, "loss": 0.00012625669478438795, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 431.125, "completions/min_length": 385.0, "epoch": 13.433823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8939282894134521, "kl": 0.009284369531087577, "learning_rate": 2.9393876235293577e-07, "loss": 9.251928713638335e-05, "reward": 0.5, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 454.25, "completions/min_length": 374.0, "epoch": 13.435294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.010778072290122509, "kl": 0.007093179854564369, "learning_rate": 2.9382184126304833e-07, "loss": 7.103054667823017e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 464.875, "completions/min_length": 365.0, "epoch": 13.436764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8013336062431335, "kl": 0.006609362200833857, "learning_rate": 2.937049337556562e-07, "loss": 6.583333015441895e-05, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 466.8125, "completions/min_length": 428.0, "epoch": 13.438235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.01631740853190422, "kl": 0.009456755826249719, "learning_rate": 2.93588039838461e-07, "loss": 9.473631507717073e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 516.0625, "completions/min_length": 414.0, "epoch": 13.439705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.3630187511444092, "kl": 0.013895156560465693, "learning_rate": 2.9347115951916345e-07, "loss": 0.00013890862464904785, "reward": 0.5740889310836792, "reward_std": 0.413959264755249, "rewards/DrugCombAccuracyCOTORM/mean": 0.4697812497615814, "rewards/DrugCombAccuracyCOTORM/std": 0.49904316663742065, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9826388955116272, "rewards/DrugCombCoverageCOTORM/std": 0.0694444477558136, "step": 9139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 443.4375, "completions/min_length": 381.0, "epoch": 13.441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.940033495426178, "kl": 0.006344513443764299, "learning_rate": 2.933542928054632e-07, "loss": 6.297959771472961e-05, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 459.875, "completions/min_length": 385.0, "epoch": 13.44264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012882069684565067, "kl": 0.007428883574903011, "learning_rate": 2.932374397050591e-07, "loss": 7.448461110470816e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 443.6875, "completions/min_length": 371.0, "epoch": 13.444117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8885719776153564, "kl": 0.008481285301968455, "learning_rate": 2.931206002256493e-07, "loss": 8.386154513573274e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 411.625, "completions/min_length": 350.0, "epoch": 13.445588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.02628082036972046, "kl": 0.010928376810625196, "learning_rate": 2.9300377437493074e-07, "loss": 0.00010836278670467436, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 423.6875, "completions/min_length": 369.0, "epoch": 13.447058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.07844868302345276, "kl": 0.01117250183597207, "learning_rate": 2.9288696216059974e-07, "loss": 0.0001135456986958161, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 450.875, "completions/min_length": 358.0, "epoch": 13.448529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 0.7560102343559265, "kl": 0.009409984573721886, "learning_rate": 2.9277016359035163e-07, "loss": 9.55481082201004e-05, "reward": 0.8889166712760925, "reward_std": 0.13248057663440704, "rewards/DrugCombAccuracyCOTORM/mean": 0.8741666674613953, "rewards/DrugCombAccuracyCOTORM/std": 0.25863316655158997, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333134651184, "rewards/DrugCombCoverageCOTORM/std": 0.291070818901062, "step": 9145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 466.3125, "completions/min_length": 374.0, "epoch": 13.45, "frac_reward_zero_std": 0.5, "grad_norm": 1.0209013223648071, "kl": 0.012748540844768286, "learning_rate": 2.9265337867188055e-07, "loss": 0.00012819317635148764, "reward": 0.6273333430290222, "reward_std": 0.04703797399997711, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 9146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 491.25, "completions/min_length": 375.0, "epoch": 13.451470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7210634350776672, "kl": 0.012999276630580425, "learning_rate": 2.925366074128802e-07, "loss": 0.0001296699047088623, "reward": 0.6075763702392578, "reward_std": 0.09876640141010284, "rewards/DrugCombAccuracyCOTORM/mean": 0.5389843583106995, "rewards/DrugCombAccuracyCOTORM/std": 0.4993995130062103, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7638888955116272, "rewards/DrugCombCoverageCOTORM/std": 0.5113928318023682, "step": 9147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 428.625, "completions/min_length": 375.0, "epoch": 13.452941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01592765375971794, "kl": 0.008844029856845737, "learning_rate": 2.9241984982104317e-07, "loss": 8.864609844749793e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 413.0625, "completions/min_length": 380.0, "epoch": 13.454411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013770166784524918, "kl": 0.0066304231295362115, "learning_rate": 2.9230310590406106e-07, "loss": 6.607002433156595e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 455.3125, "completions/min_length": 384.0, "epoch": 13.455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.017299631610512733, "kl": 0.007984017138369381, "learning_rate": 2.921863756696247e-07, "loss": 7.976062624948099e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 501.4375, "completions/min_length": 437.0, "epoch": 13.45735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3117507696151733, "kl": 0.011013096431270242, "learning_rate": 2.920696591254241e-07, "loss": 0.00011113994696643203, "reward": 0.7520833611488342, "reward_std": 0.17721109092235565, "rewards/DrugCombAccuracyCOTORM/mean": 0.7291666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3890872597694397, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 449.8125, "completions/min_length": 354.0, "epoch": 13.458823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012543048709630966, "kl": 0.007052620989270508, "learning_rate": 2.9195295627914817e-07, "loss": 7.071703294059262e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 413.6875, "completions/min_length": 330.0, "epoch": 13.46029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1315346956253052, "kl": 0.010764731327071786, "learning_rate": 2.9183626713848487e-07, "loss": 0.0001076189219020307, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 9153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 448.9375, "completions/min_length": 396.0, "epoch": 13.461764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.023851366713643074, "kl": 0.00965575291775167, "learning_rate": 2.917195917111215e-07, "loss": 9.748378943186253e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 437.375, "completions/min_length": 406.0, "epoch": 13.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01495092362165451, "kl": 0.006749148014932871, "learning_rate": 2.916029300047443e-07, "loss": 6.74969342071563e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 504.9375, "completions/min_length": 422.0, "epoch": 13.464705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8718467354774475, "kl": 0.012155916658230126, "learning_rate": 2.914862820270388e-07, "loss": 0.00012303143739700317, "reward": 0.942187488079071, "reward_std": 0.16351844370365143, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 467.4375, "completions/min_length": 415.0, "epoch": 13.466176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0200315173715353, "kl": 0.009208241943269968, "learning_rate": 2.9136964778568927e-07, "loss": 9.213812882080674e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 484.125, "completions/min_length": 389.0, "epoch": 13.467647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.4492443799972534, "kl": 0.02482702978886664, "learning_rate": 2.912530272883794e-07, "loss": 0.00024347007274627686, "reward": 0.7312500476837158, "reward_std": 0.41806113719940186, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 441.0, "completions/min_length": 393.0, "epoch": 13.469117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3578845262527466, "kl": 0.01409687427803874, "learning_rate": 2.911364205427918e-07, "loss": 0.00014004111289978027, "reward": 0.6294166445732117, "reward_std": 0.39531248807907104, "rewards/DrugCombAccuracyCOTORM/mean": 0.5550000071525574, "rewards/DrugCombAccuracyCOTORM/std": 0.4665619134902954, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2713136672973633, "step": 9159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 430.625, "completions/min_length": 341.0, "epoch": 13.470588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7654810547828674, "kl": 0.00699375185649842, "learning_rate": 2.9101982755660846e-07, "loss": 6.971840048208833e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 529.75, "completions/min_length": 457.0, "epoch": 13.472058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7806636691093445, "kl": 0.011183812166564167, "learning_rate": 2.909032483375098e-07, "loss": 0.00011190296208951622, "reward": 0.8755208253860474, "reward_std": 0.19556640088558197, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.3435921370983124, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 490.6875, "completions/min_length": 415.0, "epoch": 13.473529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.8450127840042114, "kl": 0.009744289331138134, "learning_rate": 2.9078668289317597e-07, "loss": 9.748339653015137e-05, "reward": 0.9741071462631226, "reward_std": 0.053986482322216034, "rewards/DrugCombAccuracyCOTORM/mean": 0.9702380895614624, "rewards/DrugCombAccuracyCOTORM/std": 0.08844845741987228, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 469.0, "completions/min_length": 426.0, "epoch": 13.475, "frac_reward_zero_std": 1.0, "grad_norm": 0.015116426162421703, "kl": 0.009529886185191572, "learning_rate": 2.906701312312861e-07, "loss": 9.507763752480969e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 482.0, "completions/min_length": 386.0, "epoch": 13.476470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8514024019241333, "kl": 0.010225610574707389, "learning_rate": 2.9055359335951813e-07, "loss": 0.00010286111501045525, "reward": 0.6410714387893677, "reward_std": 0.18077217042446136, "rewards/DrugCombAccuracyCOTORM/mean": 0.5982142686843872, "rewards/DrugCombAccuracyCOTORM/std": 0.489950031042099, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 9164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 447.6875, "completions/min_length": 377.0, "epoch": 13.477941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.3252859115600586, "kl": 0.009026404470205307, "learning_rate": 2.904370692855495e-07, "loss": 9.006261825561523e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 486.0625, "completions/min_length": 389.0, "epoch": 13.479411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9061381220817566, "kl": 0.00889527634717524, "learning_rate": 2.903205590170563e-07, "loss": 8.94031545612961e-05, "reward": 0.9300416707992554, "reward_std": 0.15828248858451843, "rewards/DrugCombAccuracyCOTORM/mean": 0.92166668176651, "rewards/DrugCombAccuracyCOTORM/std": 0.25377157330513, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9270833134651184, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 9166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 465.75, "completions/min_length": 407.0, "epoch": 13.480882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1080501079559326, "kl": 0.012501362012699246, "learning_rate": 2.902040625617141e-07, "loss": 0.00012461841106414795, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 448.9375, "completions/min_length": 404.0, "epoch": 13.48235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011425784789025784, "kl": 0.007668232079595327, "learning_rate": 2.900875799271973e-07, "loss": 7.65142249292694e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 498.125, "completions/min_length": 451.0, "epoch": 13.483823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.2212380170822144, "kl": 0.01085873565170914, "learning_rate": 2.8997111112117955e-07, "loss": 0.00010874122381210327, "reward": 0.35624998807907104, "reward_std": 0.20257359743118286, "rewards/DrugCombAccuracyCOTORM/mean": 0.3229166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.4491504430770874, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.020833313465118408, "rewards/DrugCombCoverageCOTORM/std": 0.9287087917327881, "step": 9169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 449.4375, "completions/min_length": 406.0, "epoch": 13.485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.776054322719574, "kl": 0.0084057702915743, "learning_rate": 2.898546561513334e-07, "loss": 8.404254913330078e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 453.3125, "completions/min_length": 405.0, "epoch": 13.486764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.011380725540220737, "kl": 0.010390363400802016, "learning_rate": 2.897382150253308e-07, "loss": 0.000104726423160173, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 421.5625, "completions/min_length": 349.0, "epoch": 13.488235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009239482693374157, "kl": 0.00956854538526386, "learning_rate": 2.896217877508426e-07, "loss": 9.546622459311038e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 9172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 529.25, "completions/min_length": 458.0, "epoch": 13.489705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3545503616333008, "kl": 0.009092751191928983, "learning_rate": 2.8950537433553845e-07, "loss": 9.094178676605225e-05, "reward": 0.3687500059604645, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 9173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 416.875, "completions/min_length": 385.0, "epoch": 13.491176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.011568635702133179, "kl": 0.00791382952593267, "learning_rate": 2.893889747870876e-07, "loss": 7.919006748124957e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 467.1875, "completions/min_length": 415.0, "epoch": 13.492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.1843096017837524, "kl": 0.012007172917947173, "learning_rate": 2.8927258911315806e-07, "loss": 0.00012019602581858635, "reward": 0.78125, "reward_std": 0.22190329432487488, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 474.4375, "completions/min_length": 416.0, "epoch": 13.494117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.019810974597930908, "kl": 0.010591378435492516, "learning_rate": 2.8915621732141705e-07, "loss": 0.0001062258961610496, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 428.3125, "completions/min_length": 371.0, "epoch": 13.495588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.021809449419379234, "kl": 0.011055052047595382, "learning_rate": 2.8903985941953083e-07, "loss": 0.000110737128125038, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 442.0, "completions/min_length": 378.0, "epoch": 13.49705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9894749522209167, "kl": 0.012119219172745943, "learning_rate": 2.8892351541516485e-07, "loss": 0.00012129145034123212, "reward": 0.574999988079071, "reward_std": 0.04629100486636162, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 9178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 422.0, "completions/min_length": 366.0, "epoch": 13.498529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.008088032715022564, "kl": 0.007564458297565579, "learning_rate": 2.888071853159835e-07, "loss": 7.550196460215375e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 450.875, "completions/min_length": 374.0, "epoch": 13.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.8073834776878357, "kl": 0.00997833046130836, "learning_rate": 2.8869086912965036e-07, "loss": 9.892570233205333e-05, "reward": 0.9104166626930237, "reward_std": 0.15759539604187012, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 400.375, "completions/min_length": 325.0, "epoch": 13.501470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.9751839637756348, "kl": 0.010773375513963401, "learning_rate": 2.885745668638279e-07, "loss": 0.00010818812006618828, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 428.625, "completions/min_length": 359.0, "epoch": 13.50294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.008659573271870613, "kl": 0.007162171066738665, "learning_rate": 2.88458278526178e-07, "loss": 7.18643786967732e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 436.1875, "completions/min_length": 391.0, "epoch": 13.504411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.014217033050954342, "kl": 0.008455182891339064, "learning_rate": 2.883420041243614e-07, "loss": 8.515974332112819e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 468.0625, "completions/min_length": 418.0, "epoch": 13.505882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8425778150558472, "kl": 0.007008336950093508, "learning_rate": 2.8822574366603804e-07, "loss": 7.011914567556232e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 497.25, "completions/min_length": 433.0, "epoch": 13.507352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.827479362487793, "kl": 0.007063104189001024, "learning_rate": 2.8810949715886657e-07, "loss": 7.098168134689331e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 456.5625, "completions/min_length": 398.0, "epoch": 13.508823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01253954041749239, "kl": 0.008393378113396466, "learning_rate": 2.879932646105052e-07, "loss": 8.392676681978628e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 400.125, "completions/min_length": 367.0, "epoch": 13.510294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02055935747921467, "kl": 0.007358446717262268, "learning_rate": 2.8787704602861106e-07, "loss": 7.355779234785587e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 440.8125, "completions/min_length": 383.0, "epoch": 13.511764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013800921849906445, "kl": 0.007271333131939173, "learning_rate": 2.8776084142084034e-07, "loss": 7.355926209129393e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 457.5625, "completions/min_length": 386.0, "epoch": 13.513235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01395732443779707, "kl": 0.0094026995357126, "learning_rate": 2.8764465079484823e-07, "loss": 9.398672409588471e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 420.9375, "completions/min_length": 351.0, "epoch": 13.514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.017597725614905357, "kl": 0.008056605700403452, "learning_rate": 2.875284741582892e-07, "loss": 8.08419135864824e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 433.5625, "completions/min_length": 394.0, "epoch": 13.516176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9630652666091919, "kl": 0.010891400859691203, "learning_rate": 2.874123115188165e-07, "loss": 0.00010841741459444165, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 451.25, "completions/min_length": 368.0, "epoch": 13.51764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020878741517663002, "kl": 0.011543993838131428, "learning_rate": 2.8729616288408274e-07, "loss": 0.00011323841317789629, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 456.3125, "completions/min_length": 397.0, "epoch": 13.519117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9787125587463379, "kl": 0.009586563101038337, "learning_rate": 2.871800282617395e-07, "loss": 9.551864786772057e-05, "reward": 0.8374999761581421, "reward_std": 0.22480149567127228, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 421.0625, "completions/min_length": 364.0, "epoch": 13.520588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010314960032701492, "kl": 0.008635445614345372, "learning_rate": 2.8706390765943745e-07, "loss": 8.531797357136384e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/mean_length": 427.4375, "completions/min_length": 362.0, "epoch": 13.522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9868772625923157, "kl": 0.00846214359626174, "learning_rate": 2.869478010848262e-07, "loss": 8.319220796693116e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 418.8125, "completions/min_length": 375.0, "epoch": 13.523529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012642157264053822, "kl": 0.007525051012635231, "learning_rate": 2.8683170854555495e-07, "loss": 7.562692189821973e-05, "reward": 0.6713333129882812, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.6100000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.40279027819633484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 9196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 428.375, "completions/min_length": 394.0, "epoch": 13.525, "frac_reward_zero_std": 0.5, "grad_norm": 1.3039895296096802, "kl": 0.011734871193766594, "learning_rate": 2.86715630049271e-07, "loss": 0.00011782375804614276, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 445.75, "completions/min_length": 404.0, "epoch": 13.526470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.02186639793217182, "kl": 0.0084792006528005, "learning_rate": 2.865995656036216e-07, "loss": 8.394368342123926e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 440.75, "completions/min_length": 374.0, "epoch": 13.527941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8761469721794128, "kl": 0.00887881382368505, "learning_rate": 2.864835152162528e-07, "loss": 8.871831960277632e-05, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 9199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 430.0, "completions/min_length": 331.0, "epoch": 13.529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.008644421584904194, "kl": 0.006237796740606427, "learning_rate": 2.863674788948097e-07, "loss": 6.227805715752766e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 419.0, "completions/min_length": 369.0, "epoch": 13.530882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.044481534510850906, "kl": 0.009807082009501755, "learning_rate": 2.862514566469364e-07, "loss": 9.714765474200249e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 477.25, "completions/min_length": 429.0, "epoch": 13.532352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.748514711856842, "kl": 0.010012057959102094, "learning_rate": 2.86135448480276e-07, "loss": 9.961426258087158e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 433.125, "completions/min_length": 390.0, "epoch": 13.533823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.052459716796875, "kl": 0.01049539283849299, "learning_rate": 2.860194544024712e-07, "loss": 0.00010479902994120494, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 484.5625, "completions/min_length": 429.0, "epoch": 13.535294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.9575408101081848, "kl": 0.00923719338607043, "learning_rate": 2.859034744211633e-07, "loss": 9.188055992126465e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 445.8125, "completions/min_length": 406.0, "epoch": 13.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0636918544769287, "kl": 0.02135896950494498, "learning_rate": 2.857875085439929e-07, "loss": 0.0002026483416557312, "reward": 0.6150833368301392, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5475000143051147, "rewards/DrugCombAccuracyCOTORM/std": 0.41562002897262573, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7708333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.26440009474754333, "step": 9205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 451.6875, "completions/min_length": 397.0, "epoch": 13.538235294117648, "frac_reward_zero_std": 1.0, "grad_norm": 0.011188698001205921, "kl": 0.008354439167305827, "learning_rate": 2.856715567785991e-07, "loss": 8.373991295229644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 439.8125, "completions/min_length": 414.0, "epoch": 13.53970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.009718891233205795, "kl": 0.00729034561663866, "learning_rate": 2.855556191326207e-07, "loss": 7.323045429075137e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 541.375, "completions/min_length": 505.0, "epoch": 13.541176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9163792133331299, "kl": 0.008166411193087697, "learning_rate": 2.8543969561369554e-07, "loss": 8.135730604408309e-05, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 496.1875, "completions/min_length": 437.0, "epoch": 13.54264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1847167015075684, "kl": 0.010868292534723878, "learning_rate": 2.8532378622946015e-07, "loss": 0.00010835379362106323, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 460.5625, "completions/min_length": 418.0, "epoch": 13.544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8413310647010803, "kl": 0.012262301868759096, "learning_rate": 2.852078909875505e-07, "loss": 0.00012370324111543596, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 421.5, "completions/min_length": 382.0, "epoch": 13.545588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01665639504790306, "kl": 0.00823561695870012, "learning_rate": 2.8509200989560134e-07, "loss": 8.216733112931252e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 431.875, "completions/min_length": 370.0, "epoch": 13.547058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009965755976736546, "kl": 0.008662917651236057, "learning_rate": 2.849761429612468e-07, "loss": 8.62761662574485e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 421.625, "completions/min_length": 378.0, "epoch": 13.548529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0161599051207304, "kl": 0.00983367336448282, "learning_rate": 2.8486029019211974e-07, "loss": 9.780771506484598e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/mean_length": 524.9375, "completions/min_length": 440.0, "epoch": 13.55, "frac_reward_zero_std": 0.0, "grad_norm": 1.1517865657806396, "kl": 0.010132367257028818, "learning_rate": 2.847444515958523e-07, "loss": 0.00010085850954055786, "reward": 0.59375, "reward_std": 0.48889654874801636, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 9214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 502.9375, "completions/min_length": 432.0, "epoch": 13.551470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.06443723291158676, "kl": 0.013961822958663106, "learning_rate": 2.846286271800757e-07, "loss": 0.00014016000204719603, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 455.875, "completions/min_length": 383.0, "epoch": 13.552941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.8694433569908142, "kl": 0.014229993801563978, "learning_rate": 2.845128169524201e-07, "loss": 0.00014314055442810059, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 453.8125, "completions/min_length": 381.0, "epoch": 13.554411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.4530328512191772, "kl": 0.015000103740021586, "learning_rate": 2.8439702092051493e-07, "loss": 0.00015319883823394775, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 407.1875, "completions/min_length": 350.0, "epoch": 13.555882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.8132221698760986, "kl": 0.00975403399206698, "learning_rate": 2.8428123909198825e-07, "loss": 9.725587005959824e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 438.0625, "completions/min_length": 372.0, "epoch": 13.55735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0843539237976074, "kl": 0.008233062457293272, "learning_rate": 2.8416547147446764e-07, "loss": 8.2455575466156e-05, "reward": 0.5546875, "reward_std": 0.04997209087014198, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.8139410614967346, "step": 9219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 442.4375, "completions/min_length": 396.0, "epoch": 13.558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.014402049593627453, "kl": 0.008026607451029122, "learning_rate": 2.840497180755795e-07, "loss": 8.09621560620144e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 430.1875, "completions/min_length": 371.0, "epoch": 13.560294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1188682317733765, "kl": 0.011416663182899356, "learning_rate": 2.8393397890294946e-07, "loss": 0.00011468917364254594, "reward": 0.3373333811759949, "reward_std": 0.20299598574638367, "rewards/DrugCombAccuracyCOTORM/mean": 0.23416666686534882, "rewards/DrugCombAccuracyCOTORM/std": 0.42270293831825256, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 436.25, "completions/min_length": 396.0, "epoch": 13.561764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.872101366519928, "kl": 0.009716604836285114, "learning_rate": 2.8381825396420214e-07, "loss": 9.637021867092699e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 430.375, "completions/min_length": 396.0, "epoch": 13.563235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.015871714800596237, "kl": 0.010191960027441382, "learning_rate": 2.8370254326696106e-07, "loss": 0.00010124724940396845, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 416.125, "completions/min_length": 363.0, "epoch": 13.564705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9755021929740906, "kl": 0.011285158572718501, "learning_rate": 2.8358684681884915e-07, "loss": 0.0001129545271396637, "reward": 0.8464166522026062, "reward_std": 0.21689993143081665, "rewards/DrugCombAccuracyCOTORM/mean": 0.8262500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.3764195442199707, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 9224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 483.5, "completions/min_length": 428.0, "epoch": 13.566176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.008255604654550552, "kl": 0.007179946405813098, "learning_rate": 2.83471164627488e-07, "loss": 7.163251575548202e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 439.5, "completions/min_length": 373.0, "epoch": 13.56764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014731151051819324, "kl": 0.009082058444619179, "learning_rate": 2.833554967004986e-07, "loss": 9.097796282730997e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 439.5625, "completions/min_length": 383.0, "epoch": 13.569117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9929007887840271, "kl": 0.008492861757986248, "learning_rate": 2.8323984304550086e-07, "loss": 8.457087096758187e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 457.5, "completions/min_length": 364.0, "epoch": 13.570588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9749695062637329, "kl": 0.01067118177888915, "learning_rate": 2.831242036701137e-07, "loss": 0.00010602176189422607, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 444.5, "completions/min_length": 398.0, "epoch": 13.572058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.013137275353074074, "kl": 0.008172406814992428, "learning_rate": 2.8300857858195533e-07, "loss": 8.153806265909225e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 437.8125, "completions/min_length": 387.0, "epoch": 13.573529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.012073301710188389, "kl": 0.006851874408312142, "learning_rate": 2.8289296778864244e-07, "loss": 6.756696529919282e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 437.625, "completions/min_length": 371.0, "epoch": 13.575, "frac_reward_zero_std": 0.5, "grad_norm": 0.9725028872489929, "kl": 0.012021461268886924, "learning_rate": 2.8277737129779147e-07, "loss": 0.0001207951718242839, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 471.625, "completions/min_length": 416.0, "epoch": 13.576470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.871590793132782, "kl": 0.011246784590184689, "learning_rate": 2.8266178911701754e-07, "loss": 0.00011273755080765113, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 462.9375, "completions/min_length": 390.0, "epoch": 13.577941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.035107120871543884, "kl": 0.008355009136721492, "learning_rate": 2.825462212539349e-07, "loss": 8.400125807384029e-05, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 479.375, "completions/min_length": 388.0, "epoch": 13.579411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 1.5220423936843872, "kl": 0.0230150509160012, "learning_rate": 2.824306677161569e-07, "loss": 0.00023066997528076172, "reward": 0.6812499761581421, "reward_std": 0.43991678953170776, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 468.125, "completions/min_length": 356.0, "epoch": 13.580882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.809889554977417, "kl": 0.010568234603852034, "learning_rate": 2.823151285112959e-07, "loss": 0.00010562429088167846, "reward": 0.9166666865348816, "reward_std": 0.0690065324306488, "rewards/DrugCombAccuracyCOTORM/mean": 0.8958333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.15957117080688477, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 447.75, "completions/min_length": 386.0, "epoch": 13.58235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03218629211187363, "kl": 0.00968831975478679, "learning_rate": 2.8219960364696336e-07, "loss": 9.77362142293714e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 548.6875, "completions/min_length": 451.0, "epoch": 13.583823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 1.1260234117507935, "kl": 0.013182264054194093, "learning_rate": 2.8208409313076973e-07, "loss": 0.00013471022248268127, "reward": 0.9052083492279053, "reward_std": 0.16788682341575623, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2083333432674408, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 9237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 413.8125, "completions/min_length": 370.0, "epoch": 13.58529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03297276049852371, "kl": 0.01003883732482791, "learning_rate": 2.819685969703246e-07, "loss": 9.953492553904653e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 424.4375, "completions/min_length": 373.0, "epoch": 13.586764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.03529013320803642, "kl": 0.01353909308090806, "learning_rate": 2.818531151732365e-07, "loss": 0.00013445827062241733, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 466.875, "completions/min_length": 405.0, "epoch": 13.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010045953094959259, "kl": 0.006815921631641686, "learning_rate": 2.8173764774711315e-07, "loss": 6.863988528493792e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 420.4375, "completions/min_length": 362.0, "epoch": 13.589705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.032961759716272354, "kl": 0.011906553758308291, "learning_rate": 2.816221946995614e-07, "loss": 0.00011948305473197252, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 455.5625, "completions/min_length": 368.0, "epoch": 13.591176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011159640736877918, "kl": 0.008950573392212391, "learning_rate": 2.8150675603818665e-07, "loss": 8.988644549390301e-05, "reward": 0.8416666984558105, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.8333333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.17213258147239685, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 9242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 447.0625, "completions/min_length": 406.0, "epoch": 13.592647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.015764594078064, "kl": 0.023101230151951313, "learning_rate": 2.813913317705938e-07, "loss": 0.00022867321968078613, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 9243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 424.0625, "completions/min_length": 350.0, "epoch": 13.594117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.011052210815250874, "kl": 0.007756576174870133, "learning_rate": 2.812759219043869e-07, "loss": 7.653184729861096e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 479.0625, "completions/min_length": 409.0, "epoch": 13.595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010939707979559898, "kl": 0.007998705375939608, "learning_rate": 2.811605264471686e-07, "loss": 8.0018253356684e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 413.625, "completions/min_length": 359.0, "epoch": 13.597058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0208617448806763, "kl": 0.011776737170293927, "learning_rate": 2.810451454065411e-07, "loss": 0.00011695176362991333, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 501.625, "completions/min_length": 421.0, "epoch": 13.598529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.921609103679657, "kl": 0.010924495290964842, "learning_rate": 2.8092977879010524e-07, "loss": 0.00010884655057452619, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 523.1875, "completions/min_length": 413.0, "epoch": 13.6, "frac_reward_zero_std": 0.5, "grad_norm": 1.0018255710601807, "kl": 0.011166528100147843, "learning_rate": 2.808144266054612e-07, "loss": 0.0001121422610594891, "reward": 0.7920416593551636, "reward_std": 0.15928915143013, "rewards/DrugCombAccuracyCOTORM/mean": 0.7540029883384705, "rewards/DrugCombAccuracyCOTORM/std": 0.35837259888648987, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8883928656578064, "rewards/DrugCombCoverageCOTORM/std": 0.20028677582740784, "step": 9248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 485.4375, "completions/min_length": 436.0, "epoch": 13.601470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012292214669287205, "kl": 0.009057968389242887, "learning_rate": 2.806990888602081e-07, "loss": 9.064864570973441e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 477.1875, "completions/min_length": 432.0, "epoch": 13.602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.014848531223833561, "kl": 0.009368272498250008, "learning_rate": 2.80583765561944e-07, "loss": 9.38514422159642e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 458.9375, "completions/min_length": 367.0, "epoch": 13.604411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.7555266618728638, "kl": 0.014339232351630926, "learning_rate": 2.8046845671826613e-07, "loss": 0.0001444891095161438, "reward": 0.7437499761581421, "reward_std": 0.3729080259799957, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 476.0625, "completions/min_length": 429.0, "epoch": 13.605882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.016014503315091133, "kl": 0.00911819247994572, "learning_rate": 2.8035316233677075e-07, "loss": 9.171116107609123e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 413.4375, "completions/min_length": 375.0, "epoch": 13.60735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01363135315477848, "kl": 0.008377955411560833, "learning_rate": 2.8023788242505343e-07, "loss": 8.364747191080824e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 437.4375, "completions/min_length": 392.0, "epoch": 13.608823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.9012792110443115, "kl": 0.01304687187075615, "learning_rate": 2.8012261699070806e-07, "loss": 0.00012945273192599416, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 420.6875, "completions/min_length": 374.0, "epoch": 13.610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010125021450221539, "kl": 0.007828297093510628, "learning_rate": 2.8000736604132817e-07, "loss": 7.830647518858314e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 409.4375, "completions/min_length": 386.0, "epoch": 13.611764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017213162034749985, "kl": 0.0066356397001072764, "learning_rate": 2.7989212958450636e-07, "loss": 6.665146793238819e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 453.3125, "completions/min_length": 399.0, "epoch": 13.613235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.028093090280890465, "kl": 0.009606002597138286, "learning_rate": 2.79776907627834e-07, "loss": 9.546722139930353e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 477.25, "completions/min_length": 415.0, "epoch": 13.614705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9778887033462524, "kl": 0.008678822312504053, "learning_rate": 2.796617001789016e-07, "loss": 8.732984133530408e-05, "reward": 0.7171875238418579, "reward_std": 0.23422911763191223, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 9258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 454.0625, "completions/min_length": 366.0, "epoch": 13.616176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9193587899208069, "kl": 0.0111765475012362, "learning_rate": 2.795465072452988e-07, "loss": 0.00011110452760476619, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 447.9375, "completions/min_length": 400.0, "epoch": 13.617647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.014340797439217567, "kl": 0.0067100574960932136, "learning_rate": 2.794313288346143e-07, "loss": 6.695310003124177e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 409.0, "completions/min_length": 355.0, "epoch": 13.619117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8828135132789612, "kl": 0.009750050725415349, "learning_rate": 2.793161649544356e-07, "loss": 9.779111132957041e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 396.875, "completions/min_length": 337.0, "epoch": 13.620588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.010732335038483143, "kl": 0.007183428155258298, "learning_rate": 2.792010156123495e-07, "loss": 7.163525151554495e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 500.1875, "completions/min_length": 406.0, "epoch": 13.62205882352941, "frac_reward_zero_std": 0.0, "grad_norm": 1.1844533681869507, "kl": 0.00930330972187221, "learning_rate": 2.7908588081594163e-07, "loss": 9.337067604064941e-05, "reward": 0.2562500238418579, "reward_std": 0.3005203902721405, "rewards/DrugCombAccuracyCOTORM/mean": 0.125, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 412.375, "completions/min_length": 326.0, "epoch": 13.623529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.018309615552425385, "kl": 0.008509261417202652, "learning_rate": 2.78970760572797e-07, "loss": 8.506049925927073e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 418.75, "completions/min_length": 338.0, "epoch": 13.625, "frac_reward_zero_std": 0.5, "grad_norm": 0.9732556343078613, "kl": 0.01747172581963241, "learning_rate": 2.7885565489049946e-07, "loss": 0.00017640841542743146, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 438.4375, "completions/min_length": 380.0, "epoch": 13.626470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.0062052011489868, "kl": 0.010228408151306212, "learning_rate": 2.787405637766315e-07, "loss": 9.867106564342976e-05, "reward": 0.7648749947547913, "reward_std": 0.14512228965759277, "rewards/DrugCombAccuracyCOTORM/mean": 0.7178124785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.3762499988079071, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 9266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 454.375, "completions/min_length": 385.0, "epoch": 13.62794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.245476245880127, "kl": 0.012166321626864374, "learning_rate": 2.786254872387752e-07, "loss": 0.0001198424506583251, "reward": 0.7749999761581421, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 9267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 427.625, "completions/min_length": 329.0, "epoch": 13.629411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013540147803723812, "kl": 0.006787204765714705, "learning_rate": 2.785104252845117e-07, "loss": 6.790879706386477e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/mean_length": 553.1875, "completions/min_length": 507.0, "epoch": 13.630882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.2575409412384033, "kl": 0.011696208734065294, "learning_rate": 2.783953779214208e-07, "loss": 0.00011630356311798096, "reward": 0.42475417256355286, "reward_std": 0.08715277910232544, "rewards/DrugCombAccuracyCOTORM/mean": 0.3121927082538605, "rewards/DrugCombAccuracyCOTORM/std": 0.17976342141628265, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.1521451622247696, "step": 9269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 448.6875, "completions/min_length": 393.0, "epoch": 13.632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.008803216740489006, "kl": 0.0074491085251793265, "learning_rate": 2.7828034515708153e-07, "loss": 7.484723755624145e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 472.0625, "completions/min_length": 413.0, "epoch": 13.633823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02106711082160473, "kl": 0.010173293529078364, "learning_rate": 2.781653269990721e-07, "loss": 0.00010209674655925483, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 447.5625, "completions/min_length": 411.0, "epoch": 13.635294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.022750984877347946, "kl": 0.0076110007939860225, "learning_rate": 2.7805032345496955e-07, "loss": 7.607491716044024e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 427.0625, "completions/min_length": 377.0, "epoch": 13.636764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8718445897102356, "kl": 0.011936560040339828, "learning_rate": 2.7793533453234996e-07, "loss": 0.00011964835721300915, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 450.4375, "completions/min_length": 389.0, "epoch": 13.638235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010002254508435726, "kl": 0.008330719894729555, "learning_rate": 2.7782036023878856e-07, "loss": 8.327873365487903e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 434.5625, "completions/min_length": 354.0, "epoch": 13.639705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1372535228729248, "kl": 0.009587127598933876, "learning_rate": 2.777054005818596e-07, "loss": 9.530968964099884e-05, "reward": 0.16233333945274353, "reward_std": 0.047967579215765, "rewards/DrugCombAccuracyCOTORM/mean": 0.020625000819563866, "rewards/DrugCombAccuracyCOTORM/std": 0.05635823681950569, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4583333432674408, "rewards/DrugCombCoverageCOTORM/std": 0.7340905666351318, "step": 9275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/mean_length": 542.25, "completions/min_length": 407.0, "epoch": 13.641176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7025684118270874, "kl": 0.008204970392398536, "learning_rate": 2.775904555691363e-07, "loss": 8.240150782512501e-05, "reward": 0.6822138428688049, "reward_std": 0.17980249226093292, "rewards/DrugCombAccuracyCOTORM/mean": 0.6546645760536194, "rewards/DrugCombAccuracyCOTORM/std": 0.4289226233959198, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5848214626312256, "rewards/DrugCombCoverageCOTORM/std": 0.8023005723953247, "step": 9276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 447.5, "completions/min_length": 384.0, "epoch": 13.64264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0013759136199951, "kl": 0.008661167463287711, "learning_rate": 2.7747552520819105e-07, "loss": 8.665025234222412e-05, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 471.0625, "completions/min_length": 380.0, "epoch": 13.644117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9435115456581116, "kl": 0.01063149911351502, "learning_rate": 2.773606095065949e-07, "loss": 0.00010675928206183016, "reward": 0.9333333373069763, "reward_std": 0.14253933727741241, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 442.3125, "completions/min_length": 354.0, "epoch": 13.645588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01641410030424595, "kl": 0.00988384650554508, "learning_rate": 2.772457084719184e-07, "loss": 9.858304110821337e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 447.9375, "completions/min_length": 413.0, "epoch": 13.647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9958705902099609, "kl": 0.010547623969614506, "learning_rate": 2.771308221117309e-07, "loss": 0.00010604411363601685, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 522.4375, "completions/min_length": 417.0, "epoch": 13.648529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.875918984413147, "kl": 0.007953356835059822, "learning_rate": 2.770159504336008e-07, "loss": 8.030980825424194e-05, "reward": 0.9108333587646484, "reward_std": 0.17344090342521667, "rewards/DrugCombAccuracyCOTORM/mean": 0.9041666984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.258736252784729, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 484.0, "completions/min_length": 425.0, "epoch": 13.65, "frac_reward_zero_std": 0.5, "grad_norm": 0.9977833032608032, "kl": 0.008698025369085371, "learning_rate": 2.769010934450956e-07, "loss": 8.716483716852963e-05, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 9282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 442.25, "completions/min_length": 372.0, "epoch": 13.651470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8774454593658447, "kl": 0.013496559113264084, "learning_rate": 2.7678625115378173e-07, "loss": 0.00013621896505355835, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 471.375, "completions/min_length": 424.0, "epoch": 13.652941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7855691313743591, "kl": 0.0074277009116485715, "learning_rate": 2.7667142356722485e-07, "loss": 7.358589937211946e-05, "reward": 0.8767499923706055, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.8537499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.31442803144454956, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 9284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 476.6875, "completions/min_length": 393.0, "epoch": 13.654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8733282685279846, "kl": 0.01109215070027858, "learning_rate": 2.765566106929893e-07, "loss": 0.00011086182348662987, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/mean_length": 421.125, "completions/min_length": 343.0, "epoch": 13.655882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.14972984790802, "kl": 0.012252868502400815, "learning_rate": 2.764418125386388e-07, "loss": 0.0001203205029014498, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 466.4375, "completions/min_length": 390.0, "epoch": 13.657352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.6981866359710693, "kl": 0.006324621965177357, "learning_rate": 2.76327029111736e-07, "loss": 6.289780139923096e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 468.5625, "completions/min_length": 426.0, "epoch": 13.658823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.4634077548980713, "kl": 0.010254981694743037, "learning_rate": 2.7621226041984236e-07, "loss": 0.00010119137004949152, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 484.1875, "completions/min_length": 415.0, "epoch": 13.660294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016121679916977882, "kl": 0.0075715008424595, "learning_rate": 2.76097506470519e-07, "loss": 7.562592509202659e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 489.0, "completions/min_length": 453.0, "epoch": 13.661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9938713908195496, "kl": 0.008972172741778195, "learning_rate": 2.75982767271325e-07, "loss": 8.996576070785522e-05, "reward": 0.925000011920929, "reward_std": 0.1752549111843109, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 9290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 444.3125, "completions/min_length": 404.0, "epoch": 13.663235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.1181434392929077, "kl": 0.011263267369940877, "learning_rate": 2.758680428298194e-07, "loss": 0.00011286884546279907, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 507.3125, "completions/min_length": 438.0, "epoch": 13.66470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7823795080184937, "kl": 0.007666977355256677, "learning_rate": 2.757533331535599e-07, "loss": 7.722921873209998e-05, "reward": 0.9364595413208008, "reward_std": 0.12421141564846039, "rewards/DrugCombAccuracyCOTORM/mean": 0.9231785535812378, "rewards/DrugCombAccuracyCOTORM/std": 0.21843455731868744, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 444.5, "completions/min_length": 384.0, "epoch": 13.666176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.9693924188613892, "kl": 0.012044730363413692, "learning_rate": 2.7563863825010335e-07, "loss": 0.00012025237083435059, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 446.9375, "completions/min_length": 401.0, "epoch": 13.66764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019616588950157166, "kl": 0.009297730517573655, "learning_rate": 2.7552395812700553e-07, "loss": 9.302777471020818e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 477.6875, "completions/min_length": 373.0, "epoch": 13.669117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8554431796073914, "kl": 0.008048143703490496, "learning_rate": 2.7540929279182125e-07, "loss": 8.130073547363281e-05, "reward": 0.7958333492279053, "reward_std": 0.19553548097610474, "rewards/DrugCombAccuracyCOTORM/mean": 0.78125, "rewards/DrugCombAccuracyCOTORM/std": 0.36371922492980957, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7083333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5426273345947266, "step": 9295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 484.6875, "completions/min_length": 380.0, "epoch": 13.670588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01696901023387909, "kl": 0.008206830592826009, "learning_rate": 2.752946422521044e-07, "loss": 8.228256774600595e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 471.875, "completions/min_length": 400.0, "epoch": 13.672058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.8779864311218262, "kl": 0.012141414685174823, "learning_rate": 2.751800065154078e-07, "loss": 0.00012093037366867065, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 469.0625, "completions/min_length": 409.0, "epoch": 13.673529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.927039623260498, "kl": 0.009335752576589584, "learning_rate": 2.7506538558928355e-07, "loss": 9.267973655369133e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 417.8125, "completions/min_length": 393.0, "epoch": 13.675, "frac_reward_zero_std": 1.0, "grad_norm": 0.00793379545211792, "kl": 0.006745253456756473, "learning_rate": 2.749507794812824e-07, "loss": 6.705483247060329e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 474.0, "completions/min_length": 398.0, "epoch": 13.676470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9884939789772034, "kl": 0.010140398517251015, "learning_rate": 2.7483618819895457e-07, "loss": 0.00010199719690717757, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 417.0625, "completions/min_length": 341.0, "epoch": 13.677941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009701631963253021, "kl": 0.006719217053614557, "learning_rate": 2.7472161174984866e-07, "loss": 6.702885730192065e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 482.75, "completions/min_length": 416.0, "epoch": 13.679411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.8087630271911621, "kl": 0.010929777286946774, "learning_rate": 2.7460705014151283e-07, "loss": 0.0001082464077626355, "reward": 0.9052583575248718, "reward_std": 0.11090248078107834, "rewards/DrugCombAccuracyCOTORM/mean": 0.8984999656677246, "rewards/DrugCombAccuracyCOTORM/std": 0.2119968831539154, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8645833730697632, "rewards/DrugCombCoverageCOTORM/std": 0.18477964401245117, "step": 9302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 430.875, "completions/min_length": 391.0, "epoch": 13.680882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.015277231112122536, "kl": 0.009805245557799935, "learning_rate": 2.744925033814941e-07, "loss": 9.783064160728827e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 414.3125, "completions/min_length": 362.0, "epoch": 13.68235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01586071401834488, "kl": 0.010012858314439654, "learning_rate": 2.743779714773386e-07, "loss": 0.00010073304292745888, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 463.8125, "completions/min_length": 404.0, "epoch": 13.683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.03518439829349518, "kl": 0.013812146848067641, "learning_rate": 2.742634544365914e-07, "loss": 0.00013781350571662188, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 493.6875, "completions/min_length": 424.0, "epoch": 13.685294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.1063843965530396, "kl": 0.013057762524113059, "learning_rate": 2.7414895226679644e-07, "loss": 0.00012979996972717345, "reward": 0.6499999761581421, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 503.25, "completions/min_length": 437.0, "epoch": 13.686764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 1.3054190874099731, "kl": 0.01344029838219285, "learning_rate": 2.740344649754971e-07, "loss": 0.00013484805822372437, "reward": 0.3125, "reward_std": 0.34973087906837463, "rewards/DrugCombAccuracyCOTORM/mean": 0.1875, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 442.9375, "completions/min_length": 366.0, "epoch": 13.688235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.011879269964993, "kl": 0.009098269045352936, "learning_rate": 2.739199925702352e-07, "loss": 9.089840750675648e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/mean_length": 476.0625, "completions/min_length": 380.0, "epoch": 13.689705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9864165186882019, "kl": 0.009310638415627182, "learning_rate": 2.738055350585522e-07, "loss": 9.057670831680298e-05, "reward": 0.9833333492279053, "reward_std": 0.047140445560216904, "rewards/DrugCombAccuracyCOTORM/mean": 0.9791666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.0833333283662796, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 439.1875, "completions/min_length": 331.0, "epoch": 13.691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01143890805542469, "kl": 0.007932942709885538, "learning_rate": 2.7369109244798805e-07, "loss": 7.93290018918924e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 463.8125, "completions/min_length": 410.0, "epoch": 13.69264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019927384331822395, "kl": 0.009674621396698058, "learning_rate": 2.73576664746082e-07, "loss": 9.631457942305133e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 435.875, "completions/min_length": 381.0, "epoch": 13.694117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.912307620048523, "kl": 0.009994311607442796, "learning_rate": 2.7346225196037255e-07, "loss": 9.949102968676016e-05, "reward": 0.38749998807907104, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.375, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.125, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 9312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 420.5625, "completions/min_length": 366.0, "epoch": 13.695588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.014821725897490978, "kl": 0.007868236978538334, "learning_rate": 2.733478540983963e-07, "loss": 7.910936983535066e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 473.875, "completions/min_length": 429.0, "epoch": 13.697058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6368549466133118, "kl": 0.007816420518793166, "learning_rate": 2.7323347116769e-07, "loss": 7.783621549606323e-05, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/mean_length": 444.75, "completions/min_length": 356.0, "epoch": 13.698529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.0685169696807861, "kl": 0.011547932866960764, "learning_rate": 2.731191031757887e-07, "loss": 0.00011551652278285474, "reward": 0.8677500486373901, "reward_std": 0.10868926346302032, "rewards/DrugCombAccuracyCOTORM/mean": 0.8425000309944153, "rewards/DrugCombAccuracyCOTORM/std": 0.24315062165260315, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 9315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 419.8125, "completions/min_length": 372.0, "epoch": 13.7, "frac_reward_zero_std": 1.0, "grad_norm": 0.012389031238853931, "kl": 0.012554045068100095, "learning_rate": 2.730047501302266e-07, "loss": 0.0001267112820642069, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 474.1875, "completions/min_length": 380.0, "epoch": 13.701470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.43623971939086914, "kl": 0.020118003245443106, "learning_rate": 2.7289041203853726e-07, "loss": 0.00020598933042492718, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 485.8125, "completions/min_length": 439.0, "epoch": 13.702941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009786118753254414, "kl": 0.007328800740651786, "learning_rate": 2.7277608890825276e-07, "loss": 7.369853847194463e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 448.0625, "completions/min_length": 371.0, "epoch": 13.704411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9681474566459656, "kl": 0.012614176608622074, "learning_rate": 2.726617807469045e-07, "loss": 0.00012600333138834685, "reward": 0.737500011920929, "reward_std": 0.2199837565422058, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 470.625, "completions/min_length": 383.0, "epoch": 13.705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.891008734703064, "kl": 0.010263502597808838, "learning_rate": 2.725474875620228e-07, "loss": 0.00010298365668859333, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 480.1875, "completions/min_length": 394.0, "epoch": 13.70735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01729664020240307, "kl": 0.008759335265494883, "learning_rate": 2.72433209361137e-07, "loss": 8.76050180522725e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 456.9375, "completions/min_length": 357.0, "epoch": 13.708823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.799854576587677, "kl": 0.011124200653284788, "learning_rate": 2.723189461517754e-07, "loss": 0.00011427052959334105, "reward": 0.8995416760444641, "reward_std": 0.12100478261709213, "rewards/DrugCombAccuracyCOTORM/mean": 0.8887500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.2226669192314148, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8854166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.2083333432674408, "step": 9322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 416.6875, "completions/min_length": 333.0, "epoch": 13.71029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0831462144851685, "kl": 0.011665860889479518, "learning_rate": 2.722046979414656e-07, "loss": 0.00011766001262003556, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 471.125, "completions/min_length": 416.0, "epoch": 13.711764705882352, "frac_reward_zero_std": 1.0, "grad_norm": 0.02099495567381382, "kl": 0.007540811435319483, "learning_rate": 2.720904647377339e-07, "loss": 7.556835043942556e-05, "reward": 0.6410000324249268, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5824999809265137, "rewards/DrugCombAccuracyCOTORM/std": 0.43119215965270996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 9324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 451.5, "completions/min_length": 401.0, "epoch": 13.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013090211898088455, "kl": 0.00916592765133828, "learning_rate": 2.7197624654810546e-07, "loss": 9.214120655087754e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 439.1875, "completions/min_length": 390.0, "epoch": 13.714705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0169615745544434, "kl": 0.007968494086526334, "learning_rate": 2.7186204338010487e-07, "loss": 7.973135507199913e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 494.25, "completions/min_length": 396.0, "epoch": 13.716176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0228207111358643, "kl": 0.008417106117121875, "learning_rate": 2.7174785524125545e-07, "loss": 8.41928122099489e-05, "reward": 0.7662020921707153, "reward_std": 0.09956257790327072, "rewards/DrugCombAccuracyCOTORM/mean": 0.7145885229110718, "rewards/DrugCombAccuracyCOTORM/std": 0.33759036660194397, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9453125, "rewards/DrugCombCoverageCOTORM/std": 0.10174263268709183, "step": 9327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 435.375, "completions/min_length": 379.0, "epoch": 13.717647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.027954982593655586, "kl": 0.01013999548740685, "learning_rate": 2.716336821390797e-07, "loss": 0.00010180856043007225, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 498.125, "completions/min_length": 432.0, "epoch": 13.719117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.846914529800415, "kl": 0.010893438709899783, "learning_rate": 2.715195240810991e-07, "loss": 0.00010905622184509411, "reward": 0.5089166760444641, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.45125001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.502684473991394, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 9329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 471.875, "completions/min_length": 391.0, "epoch": 13.720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.7992490530014038, "kl": 0.01141670485958457, "learning_rate": 2.7140538107483403e-07, "loss": 0.00011398643255233765, "reward": 0.6337916851043701, "reward_std": 0.020388245582580566, "rewards/DrugCombAccuracyCOTORM/mean": 0.5721874833106995, "rewards/DrugCombAccuracyCOTORM/std": 0.44363635778427124, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25069350004196167, "step": 9330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 444.25, "completions/min_length": 394.0, "epoch": 13.722058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.009062311612069607, "kl": 0.010341737302951515, "learning_rate": 2.712912531278039e-07, "loss": 0.00010366339120082557, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 425.5, "completions/min_length": 381.0, "epoch": 13.723529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.00988207571208477, "kl": 0.006799872382543981, "learning_rate": 2.7117714024752725e-07, "loss": 6.80971861584112e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 508.8125, "completions/min_length": 450.0, "epoch": 13.725, "frac_reward_zero_std": 0.0, "grad_norm": 1.235295295715332, "kl": 0.009844184969551861, "learning_rate": 2.7106304244152155e-07, "loss": 9.741634130477905e-05, "reward": 0.668749988079071, "reward_std": 0.3743184804916382, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 9333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 452.625, "completions/min_length": 360.0, "epoch": 13.726470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.0775631666183472, "kl": 0.012163135688751936, "learning_rate": 2.7094895971730324e-07, "loss": 0.00012089646770618856, "reward": 0.6625000238418579, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 444.0, "completions/min_length": 410.0, "epoch": 13.727941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012082346715033054, "kl": 0.01028580334968865, "learning_rate": 2.708348920823878e-07, "loss": 0.000102821177279111, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 437.5625, "completions/min_length": 385.0, "epoch": 13.729411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02095028944313526, "kl": 0.008502859971486032, "learning_rate": 2.7072083954428993e-07, "loss": 8.545227319700643e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 468.6875, "completions/min_length": 416.0, "epoch": 13.730882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.0810108184814453, "kl": 0.009568812092766166, "learning_rate": 2.7060680211052266e-07, "loss": 9.695440530776978e-05, "reward": 0.5874999761581421, "reward_std": 0.172688826918602, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 9337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 465.125, "completions/min_length": 417.0, "epoch": 13.73235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018666619434952736, "kl": 0.009149300283752382, "learning_rate": 2.7049277978859885e-07, "loss": 9.15933633223176e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 481.0, "completions/min_length": 397.0, "epoch": 13.733823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.055980920791626, "kl": 0.01146489498205483, "learning_rate": 2.703787725860298e-07, "loss": 0.0001149401068687439, "reward": 0.8528500199317932, "reward_std": 0.20320814847946167, "rewards/DrugCombAccuracyCOTORM/mean": 0.8207499980926514, "rewards/DrugCombAccuracyCOTORM/std": 0.3863793611526489, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9624999761581421, "rewards/DrugCombCoverageCOTORM/std": 0.15000000596046448, "step": 9339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 439.375, "completions/min_length": 380.0, "epoch": 13.735294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.2931822538375854, "kl": 0.010724390391260386, "learning_rate": 2.702647805103262e-07, "loss": 0.0001074671745300293, "reward": 0.8937499523162842, "reward_std": 0.3005203604698181, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 439.6875, "completions/min_length": 378.0, "epoch": 13.736764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8183919787406921, "kl": 0.011093495297245681, "learning_rate": 2.701508035689974e-07, "loss": 0.00011110785999335349, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/mean_length": 523.0625, "completions/min_length": 459.0, "epoch": 13.738235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9246318340301514, "kl": 0.009256629040464759, "learning_rate": 2.7003684176955207e-07, "loss": 9.215610043611377e-05, "reward": 0.8967083692550659, "reward_std": 0.15700268745422363, "rewards/DrugCombAccuracyCOTORM/mean": 0.87479168176651, "rewards/DrugCombAccuracyCOTORM/std": 0.2892204523086548, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.08539126068353653, "step": 9342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 407.625, "completions/min_length": 353.0, "epoch": 13.739705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.026999343186616898, "kl": 0.010160380392335355, "learning_rate": 2.6992289511949755e-07, "loss": 0.00010074355668621138, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 528.375, "completions/min_length": 415.0, "epoch": 13.741176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9309729337692261, "kl": 0.009951016516424716, "learning_rate": 2.698089636263405e-07, "loss": 9.946892532752827e-05, "reward": 0.904188871383667, "reward_std": 0.07569500803947449, "rewards/DrugCombAccuracyCOTORM/mean": 0.8967291712760925, "rewards/DrugCombAccuracyCOTORM/std": 0.1653784066438675, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8680555820465088, "rewards/DrugCombCoverageCOTORM/std": 0.2802666127681732, "step": 9344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 478.25, "completions/min_length": 417.0, "epoch": 13.742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0955013036727905, "kl": 0.010895724408328533, "learning_rate": 2.6969504729758636e-07, "loss": 0.00010916218161582947, "reward": 0.8615833520889282, "reward_std": 0.19194839894771576, "rewards/DrugCombAccuracyCOTORM/mean": 0.8400000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.3447704613208771, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8958333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.23471811413764954, "step": 9345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 441.0625, "completions/min_length": 404.0, "epoch": 13.744117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03307730332016945, "kl": 0.010911815683357418, "learning_rate": 2.695811461407397e-07, "loss": 0.00010965488763758913, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 445.375, "completions/min_length": 415.0, "epoch": 13.745588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019194792956113815, "kl": 0.009446600335650146, "learning_rate": 2.69467260163304e-07, "loss": 9.340163524029776e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 470.125, "completions/min_length": 423.0, "epoch": 13.74705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01652219519019127, "kl": 0.008324510068632662, "learning_rate": 2.69353389372782e-07, "loss": 8.366710972040892e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 451.125, "completions/min_length": 417.0, "epoch": 13.748529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.01804172433912754, "kl": 0.007866734522394836, "learning_rate": 2.6923953377667473e-07, "loss": 7.898028707131743e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 484.8125, "completions/min_length": 343.0, "epoch": 13.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.8669949173927307, "kl": 0.008853486273437738, "learning_rate": 2.6912569338248315e-07, "loss": 8.885562419891357e-05, "reward": 0.8661041855812073, "reward_std": 0.11545458436012268, "rewards/DrugCombAccuracyCOTORM/mean": 0.8384895920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.25560280680656433, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.953125, "rewards/DrugCombCoverageCOTORM/std": 0.0625, "step": 9350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 469.6875, "completions/min_length": 410.0, "epoch": 13.751470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8072167038917542, "kl": 0.01005578669719398, "learning_rate": 2.690118681977065e-07, "loss": 0.00010041147470474243, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 484.625, "completions/min_length": 436.0, "epoch": 13.75294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 1.2452503442764282, "kl": 0.010606607422232628, "learning_rate": 2.688980582298435e-07, "loss": 0.00010604411363601685, "reward": 0.6499999761581421, "reward_std": 0.39218372106552124, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 465.75, "completions/min_length": 421.0, "epoch": 13.754411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9663922786712646, "kl": 0.01208629528991878, "learning_rate": 2.6878426348639147e-07, "loss": 0.0001203135252580978, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 418.375, "completions/min_length": 394.0, "epoch": 13.755882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.8749173879623413, "kl": 0.008560091606341302, "learning_rate": 2.686704839748471e-07, "loss": 8.612604869995266e-05, "reward": 0.699999988079071, "reward_std": 0.2507132589817047, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.8944272398948669, "step": 9354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 427.5625, "completions/min_length": 388.0, "epoch": 13.757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.09981979429721832, "kl": 0.009522864478640258, "learning_rate": 2.6855671970270576e-07, "loss": 9.470132499700412e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 491.0625, "completions/min_length": 401.0, "epoch": 13.758823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 1.1848222017288208, "kl": 0.01007925602607429, "learning_rate": 2.6844297067746215e-07, "loss": 0.00010045245289802551, "reward": 0.7572916746139526, "reward_std": 0.284759521484375, "rewards/DrugCombAccuracyCOTORM/mean": 0.7395833730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.3386725187301636, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.65625, "rewards/DrugCombCoverageCOTORM/std": 0.539096474647522, "step": 9356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 497.9375, "completions/min_length": 368.0, "epoch": 13.760294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9675147533416748, "kl": 0.0094916116213426, "learning_rate": 2.6832923690660954e-07, "loss": 9.493157267570496e-05, "reward": 0.7242708206176758, "reward_std": 0.08397737145423889, "rewards/DrugCombAccuracyCOTORM/mean": 0.6891927123069763, "rewards/DrugCombAccuracyCOTORM/std": 0.34968477487564087, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.28463754057884216, "step": 9357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 461.1875, "completions/min_length": 408.0, "epoch": 13.761764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.049769118428230286, "kl": 0.011144340271130204, "learning_rate": 2.682155183976405e-07, "loss": 0.00011050270404666662, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 458.375, "completions/min_length": 397.0, "epoch": 13.763235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.2508190870285034, "kl": 0.010239180293865502, "learning_rate": 2.6810181515804655e-07, "loss": 0.00010256469249725342, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 417.0, "completions/min_length": 341.0, "epoch": 13.764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485198363661766, "kl": 0.009959512390196323, "learning_rate": 2.679881271953184e-07, "loss": 0.00010055737220682204, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 472.9375, "completions/min_length": 401.0, "epoch": 13.766176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8905049562454224, "kl": 0.008594778133556247, "learning_rate": 2.67874454516945e-07, "loss": 8.5618463344872e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 471.375, "completions/min_length": 416.0, "epoch": 13.76764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234174728393555, "kl": 0.010622780653648078, "learning_rate": 2.677607971304152e-07, "loss": 0.00010657310485839844, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 422.0, "completions/min_length": 362.0, "epoch": 13.769117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.08731109648942947, "kl": 0.012084940914064646, "learning_rate": 2.6764715504321636e-07, "loss": 0.00012268776481505483, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 483.75, "completions/min_length": 426.0, "epoch": 13.770588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.008441719226539135, "kl": 0.007147445110604167, "learning_rate": 2.675335282628348e-07, "loss": 7.18687369953841e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 419.75, "completions/min_length": 367.0, "epoch": 13.772058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.014992468059062958, "kl": 0.008746563224121928, "learning_rate": 2.6741991679675626e-07, "loss": 8.784697274677455e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 493.25, "completions/min_length": 435.0, "epoch": 13.773529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4617735147476196, "kl": 0.011922233272343874, "learning_rate": 2.673063206524649e-07, "loss": 0.00011879205703735352, "reward": 0.36250001192092896, "reward_std": 0.3934735357761383, "rewards/DrugCombAccuracyCOTORM/mean": 0.25, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 492.6875, "completions/min_length": 368.0, "epoch": 13.775, "frac_reward_zero_std": 1.0, "grad_norm": 0.013676058501005173, "kl": 0.008936944883316755, "learning_rate": 2.671927398374443e-07, "loss": 8.938890823628753e-05, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/mean_length": 464.9375, "completions/min_length": 391.0, "epoch": 13.776470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 0.8175296783447266, "kl": 0.00835159805137664, "learning_rate": 2.670791743591768e-07, "loss": 8.37757543195039e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 412.0625, "completions/min_length": 340.0, "epoch": 13.777941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011912679299712181, "kl": 0.00614576053339988, "learning_rate": 2.669656242251439e-07, "loss": 6.126276275608689e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 451.0, "completions/min_length": 427.0, "epoch": 13.779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.2532089948654175, "kl": 0.009872122784145176, "learning_rate": 2.668520894428259e-07, "loss": 9.900331497192383e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 445.6875, "completions/min_length": 412.0, "epoch": 13.780882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0094446437433362, "kl": 0.006580965826287866, "learning_rate": 2.667385700197022e-07, "loss": 6.599722109967843e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 465.6875, "completions/min_length": 402.0, "epoch": 13.782352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.96175616979599, "kl": 0.014958210056647658, "learning_rate": 2.666250659632515e-07, "loss": 0.00014931336045265198, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 402.125, "completions/min_length": 352.0, "epoch": 13.783823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02180304005742073, "kl": 0.011095887864939868, "learning_rate": 2.6651157728095064e-07, "loss": 0.00011148669000249356, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 439.375, "completions/min_length": 368.0, "epoch": 13.785294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.8648571968078613, "kl": 0.010045806178823113, "learning_rate": 2.663981039802761e-07, "loss": 0.00010218191891908646, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 490.375, "completions/min_length": 405.0, "epoch": 13.786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1519076824188232, "kl": 0.010120680090039968, "learning_rate": 2.6628464606870336e-07, "loss": 0.00010107713023899123, "reward": 0.8476190567016602, "reward_std": 0.19579315185546875, "rewards/DrugCombAccuracyCOTORM/mean": 0.8303571939468384, "rewards/DrugCombAccuracyCOTORM/std": 0.3308153450489044, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 433.875, "completions/min_length": 322.0, "epoch": 13.788235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3569234609603882, "kl": 0.011694313958287239, "learning_rate": 2.6617120355370663e-07, "loss": 0.00011782348155975342, "reward": 0.6161166429519653, "reward_std": 0.3788818418979645, "rewards/DrugCombAccuracyCOTORM/mean": 0.5508750081062317, "rewards/DrugCombAccuracyCOTORM/std": 0.46979427337646484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7541666626930237, "rewards/DrugCombCoverageCOTORM/std": 0.3448832631111145, "step": 9376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 439.75, "completions/min_length": 399.0, "epoch": 13.78970588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7994120121002197, "kl": 0.009733804035931826, "learning_rate": 2.660577764427593e-07, "loss": 9.701400995254517e-05, "reward": 0.780750036239624, "reward_std": 0.19219204783439636, "rewards/DrugCombAccuracyCOTORM/mean": 0.7493749856948853, "rewards/DrugCombAccuracyCOTORM/std": 0.38815969228744507, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.49581584334373474, "step": 9377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 452.5, "completions/min_length": 378.0, "epoch": 13.791176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012464085593819618, "kl": 0.007157400134019554, "learning_rate": 2.659443647433336e-07, "loss": 7.186405127868056e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 433.5625, "completions/min_length": 369.0, "epoch": 13.79264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1414347887039185, "kl": 0.009944290155544877, "learning_rate": 2.6583096846290087e-07, "loss": 9.932211833074689e-05, "reward": 0.7052916884422302, "reward_std": 0.1147107183933258, "rewards/DrugCombAccuracyCOTORM/mean": 0.6544010639190674, "rewards/DrugCombAccuracyCOTORM/std": 0.3946917653083801, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.27216553688049316, "step": 9379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 411.0, "completions/min_length": 348.0, "epoch": 13.794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014462127350270748, "kl": 0.007900180062279105, "learning_rate": 2.6571758760893125e-07, "loss": 7.84452713560313e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 488.625, "completions/min_length": 424.0, "epoch": 13.795588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.127321481704712, "kl": 0.00915657717268914, "learning_rate": 2.6560422218889413e-07, "loss": 9.167202370008454e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 481.25, "completions/min_length": 385.0, "epoch": 13.797058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 1.5689823627471924, "kl": 0.008135581039823592, "learning_rate": 2.654908722102577e-07, "loss": 8.146464824676514e-05, "reward": 0.8812500238418579, "reward_std": 0.3358757197856903, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.5439056158065796, "step": 9382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 416.0, "completions/min_length": 337.0, "epoch": 13.798529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01653056964278221, "kl": 0.00842067354824394, "learning_rate": 2.653775376804891e-07, "loss": 8.345593232661486e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/mean_length": 485.8125, "completions/min_length": 425.0, "epoch": 13.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.2927074432373047, "kl": 0.010036964900791645, "learning_rate": 2.6526421860705473e-07, "loss": 0.00010047107934951782, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 448.0625, "completions/min_length": 401.0, "epoch": 13.801470588235293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01595105417072773, "kl": 0.015308947302401066, "learning_rate": 2.651509149974194e-07, "loss": 0.0001534912153147161, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 435.875, "completions/min_length": 375.0, "epoch": 13.802941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.088287591934204, "kl": 0.01641911454498768, "learning_rate": 2.650376268590475e-07, "loss": 0.0001647062599658966, "reward": 0.6079999804496765, "reward_std": 0.054230544716119766, "rewards/DrugCombAccuracyCOTORM/mean": 0.5412499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.4801371693611145, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.3333333432674408, "step": 9386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 464.25, "completions/min_length": 400.0, "epoch": 13.804411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01591302827000618, "kl": 0.009907599771395326, "learning_rate": 2.6492435419940213e-07, "loss": 9.8510216048453e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 460.875, "completions/min_length": 351.0, "epoch": 13.805882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 1.56459379196167, "kl": 0.013441836461424828, "learning_rate": 2.6481109702594537e-07, "loss": 0.00013097375631332397, "reward": 0.38749998807907104, "reward_std": 0.44393861293792725, "rewards/DrugCombAccuracyCOTORM/mean": 0.3125, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.375, "rewards/DrugCombCoverageCOTORM/std": 0.9574271440505981, "step": 9388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 427.0625, "completions/min_length": 361.0, "epoch": 13.80735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.3369535207748413, "kl": 0.010845356271602213, "learning_rate": 2.6469785534613836e-07, "loss": 0.00010830536484718323, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 427.0, "completions/min_length": 380.0, "epoch": 13.808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.022176777943968773, "kl": 0.010789487045258284, "learning_rate": 2.6458462916744117e-07, "loss": 0.00010711729555623606, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 429.4375, "completions/min_length": 352.0, "epoch": 13.810294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.022398585453629494, "kl": 0.011623340891674161, "learning_rate": 2.644714184973128e-07, "loss": 0.00011743436334654689, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 479.125, "completions/min_length": 403.0, "epoch": 13.811764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01880645751953125, "kl": 0.00816052861046046, "learning_rate": 2.6435822334321143e-07, "loss": 8.144525054376572e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 494.4375, "completions/min_length": 433.0, "epoch": 13.813235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.9847210049629211, "kl": 0.010169528657570481, "learning_rate": 2.642450437125939e-07, "loss": 0.00010031714919023216, "reward": 0.9750000238418579, "reward_std": 0.0707106739282608, "rewards/DrugCombAccuracyCOTORM/mean": 0.96875, "rewards/DrugCombAccuracyCOTORM/std": 0.125, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 461.6875, "completions/min_length": 393.0, "epoch": 13.814705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02272382192313671, "kl": 0.007771062781102955, "learning_rate": 2.641318796129163e-07, "loss": 7.766264752717689e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 427.5625, "completions/min_length": 379.0, "epoch": 13.816176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7989005446434021, "kl": 0.009587233187630773, "learning_rate": 2.6401873105163355e-07, "loss": 9.624323865864426e-05, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 477.4375, "completions/min_length": 383.0, "epoch": 13.81764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9571278095245361, "kl": 0.011177547508850694, "learning_rate": 2.639055980361998e-07, "loss": 0.00011171400547027588, "reward": 0.874250054359436, "reward_std": 0.07944583147764206, "rewards/DrugCombAccuracyCOTORM/mean": 0.8610416650772095, "rewards/DrugCombAccuracyCOTORM/std": 0.18931345641613007, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.21836937963962555, "step": 9396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 477.375, "completions/min_length": 430.0, "epoch": 13.819117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.014688778668642044, "kl": 0.009547976078465581, "learning_rate": 2.6379248057406753e-07, "loss": 9.510162635706365e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 397.25, "completions/min_length": 338.0, "epoch": 13.820588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.009406010620296001, "kl": 0.006487637059763074, "learning_rate": 2.6367937867268896e-07, "loss": 6.49432186037302e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 440.9375, "completions/min_length": 389.0, "epoch": 13.822058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.3200873136520386, "kl": 0.009777233470231295, "learning_rate": 2.635662923395148e-07, "loss": 9.724889969220385e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 442.3125, "completions/min_length": 387.0, "epoch": 13.823529411764707, "frac_reward_zero_std": 0.5, "grad_norm": 1.080880880355835, "kl": 0.009107751888222992, "learning_rate": 2.63453221581995e-07, "loss": 9.172409772872925e-05, "reward": 0.4723958373069763, "reward_std": 0.16509583592414856, "rewards/DrugCombAccuracyCOTORM/mean": 0.4166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.49441325664520264, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.40625, "rewards/DrugCombCoverageCOTORM/std": 0.4905354380607605, "step": 9400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/mean_length": 463.8125, "completions/min_length": 438.0, "epoch": 13.825, "frac_reward_zero_std": 0.5, "grad_norm": 1.11464262008667, "kl": 0.010181164601817727, "learning_rate": 2.633401664075783e-07, "loss": 0.00010147690773010254, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 422.375, "completions/min_length": 378.0, "epoch": 13.826470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011556347832083702, "kl": 0.008567876066081226, "learning_rate": 2.632271268237126e-07, "loss": 8.593171514803544e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 489.9375, "completions/min_length": 382.0, "epoch": 13.827941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.009207479655742645, "kl": 0.007147567346692085, "learning_rate": 2.6311410283784453e-07, "loss": 7.146307325456291e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 418.875, "completions/min_length": 353.0, "epoch": 13.829411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.011482499539852142, "kl": 0.00889713445212692, "learning_rate": 2.6300109445741993e-07, "loss": 8.828003774397075e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 421.8125, "completions/min_length": 370.0, "epoch": 13.830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.022640526294708252, "kl": 0.011706040473654866, "learning_rate": 2.628881016898834e-07, "loss": 0.00011614998220466077, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 509.375, "completions/min_length": 432.0, "epoch": 13.83235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9309366345405579, "kl": 0.008530437597073615, "learning_rate": 2.627751245426787e-07, "loss": 8.569976489525288e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 433.1875, "completions/min_length": 396.0, "epoch": 13.833823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.017050614580512047, "kl": 0.008032247656956315, "learning_rate": 2.626621630232484e-07, "loss": 8.083461580099538e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 423.625, "completions/min_length": 352.0, "epoch": 13.83529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0717260837554932, "kl": 0.011679985094815493, "learning_rate": 2.625492171390344e-07, "loss": 0.00011797652405221015, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 9408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 480.1875, "completions/min_length": 437.0, "epoch": 13.836764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.0442843437194824, "kl": 0.01075797202065587, "learning_rate": 2.624362868974769e-07, "loss": 0.00010724365711212158, "reward": 0.5630208253860474, "reward_std": 0.04395149275660515, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 0.96875, "rewards/DrugCombCOTFormatORM/std": 0.125, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6458333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.6935549974441528, "step": 9409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 421.0, "completions/min_length": 373.0, "epoch": 13.838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.015190502628684044, "kl": 0.008139524725265801, "learning_rate": 2.623233723060156e-07, "loss": 8.119195263134316e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 460.0625, "completions/min_length": 412.0, "epoch": 13.839705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.13652765750885, "kl": 0.01121720066294074, "learning_rate": 2.6221047337208907e-07, "loss": 0.00011192075908184052, "reward": 0.6602500081062317, "reward_std": 0.20330186188220978, "rewards/DrugCombAccuracyCOTORM/mean": 0.6456249952316284, "rewards/DrugCombAccuracyCOTORM/std": 0.47505396604537964, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.4375, "rewards/DrugCombCoverageCOTORM/std": 0.7932003140449524, "step": 9411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 427.9375, "completions/min_length": 372.0, "epoch": 13.841176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.0306490659713745, "kl": 0.01257944549433887, "learning_rate": 2.620975901031348e-07, "loss": 0.0001246631145477295, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 463.125, "completions/min_length": 425.0, "epoch": 13.842647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.0357390642166138, "kl": 0.01264697010628879, "learning_rate": 2.6198472250658923e-07, "loss": 0.00012604892253875732, "reward": 0.25, "reward_std": 0.1414213627576828, "rewards/DrugCombAccuracyCOTORM/mean": 0.0625, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 414.5, "completions/min_length": 342.0, "epoch": 13.844117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.8790192008018494, "kl": 0.009938473580405116, "learning_rate": 2.618718705898877e-07, "loss": 9.947663056664169e-05, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 467.8125, "completions/min_length": 414.0, "epoch": 13.845588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8326094150543213, "kl": 0.01327540201600641, "learning_rate": 2.6175903436046474e-07, "loss": 0.00013334002869669348, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 416.5625, "completions/min_length": 355.0, "epoch": 13.847058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.040973156690597534, "kl": 0.009035779046826065, "learning_rate": 2.6164621382575383e-07, "loss": 8.983060251921415e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 500.4375, "completions/min_length": 448.0, "epoch": 13.848529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 1.285046100616455, "kl": 0.011747694574296474, "learning_rate": 2.61533408993187e-07, "loss": 0.00011752545833587646, "reward": 0.8989583253860474, "reward_std": 0.24278132617473602, "rewards/DrugCombAccuracyCOTORM/mean": 0.8854166865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.2770128548145294, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.90625, "rewards/DrugCombCoverageCOTORM/std": 0.2719528079032898, "step": 9417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 462.3125, "completions/min_length": 394.0, "epoch": 13.85, "frac_reward_zero_std": 1.0, "grad_norm": 0.3250328302383423, "kl": 0.020139389554969966, "learning_rate": 2.6142061987019574e-07, "loss": 0.00019982337835244834, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 449.5, "completions/min_length": 351.0, "epoch": 13.851470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011225888505578041, "kl": 0.006611187709495425, "learning_rate": 2.6130784646421034e-07, "loss": 6.568680691998452e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 473.25, "completions/min_length": 436.0, "epoch": 13.852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.0582211017608643, "kl": 0.01130117499269545, "learning_rate": 2.6119508878266004e-07, "loss": 0.00011414289474487305, "reward": 0.8802083730697632, "reward_std": 0.10019201785326004, "rewards/DrugCombAccuracyCOTORM/mean": 0.8541666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.22669117152690887, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.96875, "rewards/DrugCombCoverageCOTORM/std": 0.125, "step": 9420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 504.6875, "completions/min_length": 441.0, "epoch": 13.854411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9884533882141113, "kl": 0.010717732599005103, "learning_rate": 2.6108234683297283e-07, "loss": 0.0001068413257598877, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 509.0625, "completions/min_length": 420.0, "epoch": 13.855882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 2.2314670085906982, "kl": 0.011728778015822172, "learning_rate": 2.60969620622576e-07, "loss": 0.00011715292930603027, "reward": 0.688326358795166, "reward_std": 0.24759897589683533, "rewards/DrugCombAccuracyCOTORM/mean": 0.6197395920753479, "rewards/DrugCombAccuracyCOTORM/std": 0.3984372615814209, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9253472089767456, "rewards/DrugCombCoverageCOTORM/std": 0.0947265699505806, "step": 9422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 491.125, "completions/min_length": 445.0, "epoch": 13.85735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8803617358207703, "kl": 0.009916019742377102, "learning_rate": 2.608569101588956e-07, "loss": 9.846070315688848e-05, "reward": 0.9589166641235352, "reward_std": 0.11620122194290161, "rewards/DrugCombAccuracyCOTORM/mean": 0.9512500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.19500000774860382, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 397.5625, "completions/min_length": 365.0, "epoch": 13.858823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.08737102150917053, "kl": 0.012119118589907885, "learning_rate": 2.607442154493568e-07, "loss": 0.0001222830469487235, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 487.625, "completions/min_length": 435.0, "epoch": 13.860294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 1.7472965717315674, "kl": 0.015033591073006392, "learning_rate": 2.6063153650138356e-07, "loss": 0.00015023350715637207, "reward": 0.7708125114440918, "reward_std": 0.23937739431858063, "rewards/DrugCombAccuracyCOTORM/mean": 0.7213281393051147, "rewards/DrugCombAccuracyCOTORM/std": 0.3301100730895996, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.08333335071802139, "step": 9425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 397.5625, "completions/min_length": 339.0, "epoch": 13.861764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.0193212032318115, "kl": 0.015052264905534685, "learning_rate": 2.6051887332239895e-07, "loss": 0.0001509338617324829, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 391.125, "completions/min_length": 345.0, "epoch": 13.863235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.35815051198005676, "kl": 0.016880530631169677, "learning_rate": 2.6040622591982495e-07, "loss": 0.00017074814240913838, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/mean_length": 462.75, "completions/min_length": 392.0, "epoch": 13.864705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.010925950482487679, "kl": 0.0065114680910483, "learning_rate": 2.6029359430108244e-07, "loss": 6.546240911120549e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 505.5, "completions/min_length": 413.0, "epoch": 13.866176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8299677968025208, "kl": 0.010200212709605694, "learning_rate": 2.601809784735913e-07, "loss": 0.00010236832167720422, "reward": 0.31041669845581055, "reward_std": 0.019287927076220512, "rewards/DrugCombAccuracyCOTORM/mean": 0.2708333432674408, "rewards/DrugCombAccuracyCOTORM/std": 0.28463754057884216, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.0625, "rewards/DrugCombCoverageCOTORM/std": 0.981070876121521, "step": 9429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 433.5, "completions/min_length": 374.0, "epoch": 13.867647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02778523787856102, "kl": 0.010762413265183568, "learning_rate": 2.6006837844477035e-07, "loss": 0.00010749633656814694, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 435.3125, "completions/min_length": 371.0, "epoch": 13.869117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9750983119010925, "kl": 0.00991739984601736, "learning_rate": 2.599557942220375e-07, "loss": 9.93087887763977e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 481.0625, "completions/min_length": 370.0, "epoch": 13.870588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.992331862449646, "kl": 0.013695543864741921, "learning_rate": 2.5984322581280967e-07, "loss": 0.0001474400924053043, "reward": 0.987500011920929, "reward_std": 0.023145489394664764, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 471.5, "completions/min_length": 386.0, "epoch": 13.87205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.008628234267234802, "kl": 0.008458300144411623, "learning_rate": 2.597306732245021e-07, "loss": 8.404359687119722e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 461.75, "completions/min_length": 395.0, "epoch": 13.873529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.013902680948376656, "kl": 0.008861471200361848, "learning_rate": 2.596181364645298e-07, "loss": 8.8349319412373e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 559.3125, "completions/min_length": 465.0, "epoch": 13.875, "frac_reward_zero_std": 0.0, "grad_norm": 1.2533109188079834, "kl": 0.011031950591132045, "learning_rate": 2.595056155403063e-07, "loss": 0.00011149048805236816, "reward": 0.530245840549469, "reward_std": 0.43550360202789307, "rewards/DrugCombAccuracyCOTORM/mean": 0.4801250100135803, "rewards/DrugCombAccuracyCOTORM/std": 0.4816882908344269, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.46145832538604736, "rewards/DrugCombCoverageCOTORM/std": 0.874149739742279, "step": 9435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 485.375, "completions/min_length": 398.0, "epoch": 13.876470588235295, "frac_reward_zero_std": 0.5, "grad_norm": 1.9234097003936768, "kl": 0.015522000147029757, "learning_rate": 2.5939311045924417e-07, "loss": 0.00015542491746600717, "reward": 0.9604166746139526, "reward_std": 0.07329408079385757, "rewards/DrugCombAccuracyCOTORM/mean": 0.9583333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.11385500431060791, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.17078252136707306, "step": 9436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 451.0625, "completions/min_length": 409.0, "epoch": 13.87794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.012930241413414478, "kl": 0.007588213426060975, "learning_rate": 2.592806212287551e-07, "loss": 7.59198737796396e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 499.0625, "completions/min_length": 400.0, "epoch": 13.879411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.2708075046539307, "kl": 0.011621693382039666, "learning_rate": 2.591681478562495e-07, "loss": 0.00011733919382095337, "reward": 0.7750000357627869, "reward_std": 0.24053511023521423, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.44721361994743347, "step": 9438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 431.6875, "completions/min_length": 364.0, "epoch": 13.880882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012952733784914017, "kl": 0.00810647220350802, "learning_rate": 2.590556903491369e-07, "loss": 8.095158409560099e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 489.3125, "completions/min_length": 446.0, "epoch": 13.882352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8684767484664917, "kl": 0.010480618569999933, "learning_rate": 2.5894324871482555e-07, "loss": 0.0001052144289133139, "reward": 0.9437500238418579, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 421.875, "completions/min_length": 385.0, "epoch": 13.883823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.03215659037232399, "kl": 0.009833522024564445, "learning_rate": 2.58830822960723e-07, "loss": 9.971598046831787e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 437.1875, "completions/min_length": 358.0, "epoch": 13.885294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0255937576293945, "kl": 0.017675424925982952, "learning_rate": 2.5871841309423553e-07, "loss": 0.00017715053400024772, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 487.375, "completions/min_length": 381.0, "epoch": 13.886764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3251041173934937, "kl": 0.011275654658675194, "learning_rate": 2.5860601912276834e-07, "loss": 0.00011142343282699585, "reward": 0.71875, "reward_std": 0.39963412284851074, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.6875, "rewards/DrugCombCoverageCOTORM/std": 0.4787135720252991, "step": 9443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 488.875, "completions/min_length": 426.0, "epoch": 13.888235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 1.2642244100570679, "kl": 0.009986110497266054, "learning_rate": 2.5849364105372593e-07, "loss": 0.00010052323341369629, "reward": 0.6085833311080933, "reward_std": 0.22800827026367188, "rewards/DrugCombAccuracyCOTORM/mean": 0.534166693687439, "rewards/DrugCombAccuracyCOTORM/std": 0.4525999128818512, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.20069323480129242, "step": 9444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 511.3125, "completions/min_length": 445.0, "epoch": 13.889705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.8876588940620422, "kl": 0.012108942959457636, "learning_rate": 2.583812788945111e-07, "loss": 0.00012159397738287225, "reward": 0.45266667008399963, "reward_std": 0.20593512058258057, "rewards/DrugCombAccuracyCOTORM/mean": 0.32625001668930054, "rewards/DrugCombAccuracyCOTORM/std": 0.47225525975227356, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25819888710975647, "step": 9445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 459.625, "completions/min_length": 392.0, "epoch": 13.891176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.903525710105896, "kl": 0.01060530822724104, "learning_rate": 2.582689326525262e-07, "loss": 0.0001053735613822937, "reward": 0.7895833253860474, "reward_std": 0.1473138928413391, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.26440009474754333, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7291666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.25, "step": 9446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 518.875, "completions/min_length": 410.0, "epoch": 13.89264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8971433639526367, "kl": 0.012110088486224413, "learning_rate": 2.5815660233517225e-07, "loss": 0.00012063980102539062, "reward": 0.6371762156486511, "reward_std": 0.1488337516784668, "rewards/DrugCombAccuracyCOTORM/mean": 0.5933452248573303, "rewards/DrugCombAccuracyCOTORM/std": 0.467995285987854, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.670820415019989, "step": 9447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 461.5, "completions/min_length": 391.0, "epoch": 13.894117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014595743268728256, "kl": 0.010032033082097769, "learning_rate": 2.5804428794984923e-07, "loss": 0.00010042058420367539, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 476.5, "completions/min_length": 414.0, "epoch": 13.895588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8700894117355347, "kl": 0.01215771003626287, "learning_rate": 2.5793198950395633e-07, "loss": 0.0001224428415298462, "reward": 0.8025000095367432, "reward_std": 0.18670706450939178, "rewards/DrugCombAccuracyCOTORM/mean": 0.800000011920929, "rewards/DrugCombAccuracyCOTORM/std": 0.3265986442565918, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.6540472507476807, "step": 9449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 478.75, "completions/min_length": 440.0, "epoch": 13.897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.9617879986763, "kl": 0.009320558630861342, "learning_rate": 2.5781970700489126e-07, "loss": 9.301870159106329e-05, "reward": 0.699999988079071, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 498.75, "completions/min_length": 454.0, "epoch": 13.898529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7191188335418701, "kl": 0.008868925273418427, "learning_rate": 2.5770744046005104e-07, "loss": 8.871007594279945e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 408.3125, "completions/min_length": 377.0, "epoch": 13.9, "frac_reward_zero_std": 1.0, "grad_norm": 0.016080310568213463, "kl": 0.00652341969544068, "learning_rate": 2.575951898768315e-07, "loss": 6.502816540887579e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 470.5625, "completions/min_length": 418.0, "epoch": 13.901470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.01083430927246809, "kl": 0.007117051514796913, "learning_rate": 2.5748295526262736e-07, "loss": 7.085315883159637e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 461.75, "completions/min_length": 398.0, "epoch": 13.902941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.9454535841941833, "kl": 0.011452495586127043, "learning_rate": 2.5737073662483244e-07, "loss": 0.00011403357348171994, "reward": 0.831250011920929, "reward_std": 0.23289711773395538, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8125, "rewards/DrugCombCoverageCOTORM/std": 0.40311288833618164, "step": 9454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 505.8125, "completions/min_length": 385.0, "epoch": 13.904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.9493837952613831, "kl": 0.008802420808933675, "learning_rate": 2.572585339708393e-07, "loss": 8.747920219320804e-05, "reward": 0.8547187447547913, "reward_std": 0.14095081388950348, "rewards/DrugCombAccuracyCOTORM/mean": 0.8193749785423279, "rewards/DrugCombAccuracyCOTORM/std": 0.30232641100883484, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9921875, "rewards/DrugCombCoverageCOTORM/std": 0.03125, "step": 9455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/mean_length": 542.75, "completions/min_length": 443.0, "epoch": 13.905882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9435456395149231, "kl": 0.008286808733828366, "learning_rate": 2.571463473080399e-07, "loss": 8.288025856018066e-05, "reward": 0.7767500281333923, "reward_std": 0.17010116577148438, "rewards/DrugCombAccuracyCOTORM/mean": 0.7287499904632568, "rewards/DrugCombAccuracyCOTORM/std": 0.2766435742378235, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 9456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 443.3125, "completions/min_length": 329.0, "epoch": 13.907352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0079838037490845, "kl": 0.010005400981754065, "learning_rate": 2.570341766438243e-07, "loss": 9.976152796298265e-05, "reward": 0.8374999761581421, "reward_std": 0.22638463973999023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 467.3125, "completions/min_length": 409.0, "epoch": 13.908823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7873241901397705, "kl": 0.009911375935189426, "learning_rate": 2.5692202198558236e-07, "loss": 9.845569729804993e-05, "reward": 0.7425000071525574, "reward_std": 0.16180676221847534, "rewards/DrugCombAccuracyCOTORM/mean": 0.6937500238418579, "rewards/DrugCombAccuracyCOTORM/std": 0.41161268949508667, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.1666666567325592, "step": 9458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 454.0, "completions/min_length": 393.0, "epoch": 13.910294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.05544362962245941, "kl": 0.008669131435453892, "learning_rate": 2.568098833407024e-07, "loss": 8.710185647942126e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 427.875, "completions/min_length": 333.0, "epoch": 13.911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8436938524246216, "kl": 0.00862608477473259, "learning_rate": 2.566977607165719e-07, "loss": 8.570030331611633e-05, "reward": 0.5854166746139526, "reward_std": 0.02260337956249714, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.3435921370983124, "step": 9460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/mean_length": 450.1875, "completions/min_length": 368.0, "epoch": 13.913235294117648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0132372379302979, "kl": 0.009340123971924186, "learning_rate": 2.565856541205772e-07, "loss": 9.362946730107069e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 466.0625, "completions/min_length": 412.0, "epoch": 13.91470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 3.217221736907959, "kl": 0.014974863268435001, "learning_rate": 2.5647356356010363e-07, "loss": 0.0001493285526521504, "reward": 0.8999999761581421, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 449.875, "completions/min_length": 416.0, "epoch": 13.916176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016868190839886665, "kl": 0.007724436582066119, "learning_rate": 2.563614890425354e-07, "loss": 7.744962931610644e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 505.5, "completions/min_length": 434.0, "epoch": 13.91764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3281960487365723, "kl": 0.011063659330829978, "learning_rate": 2.562494305752557e-07, "loss": 0.00011049211025238037, "reward": 0.8145833611488342, "reward_std": 0.27259859442710876, "rewards/DrugCombAccuracyCOTORM/mean": 0.7708333730697632, "rewards/DrugCombAccuracyCOTORM/std": 0.39849257469177246, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.05692751333117485, "step": 9464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 447.5625, "completions/min_length": 385.0, "epoch": 13.919117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8428249955177307, "kl": 0.0071417519357055426, "learning_rate": 2.561373881656466e-07, "loss": 7.128405559342355e-05, "reward": 0.887499988079071, "reward_std": 0.20830951631069183, "rewards/DrugCombAccuracyCOTORM/mean": 0.875, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.3415650427341461, "step": 9465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 523.1875, "completions/min_length": 398.0, "epoch": 13.920588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.6153276562690735, "kl": 0.008721170364879072, "learning_rate": 2.5602536182108923e-07, "loss": 8.713454008102417e-05, "reward": 0.75, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.6875, "rewards/DrugCombAccuracyCOTORM/std": 0.4787135720252991, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 463.0625, "completions/min_length": 386.0, "epoch": 13.922058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.9073415994644165, "kl": 0.009337034076452255, "learning_rate": 2.5591335154896364e-07, "loss": 9.345217404188588e-05, "reward": 0.949999988079071, "reward_std": 0.09258200973272324, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.17078252136707306, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 508.625, "completions/min_length": 451.0, "epoch": 13.923529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015338276512920856, "kl": 0.008883593254722655, "learning_rate": 2.55801357356649e-07, "loss": 8.919646643335e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 454.9375, "completions/min_length": 424.0, "epoch": 13.925, "frac_reward_zero_std": 1.0, "grad_norm": 0.010784550569951534, "kl": 0.008460283977910876, "learning_rate": 2.556893792515227e-07, "loss": 8.470108878100291e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 490.375, "completions/min_length": 401.0, "epoch": 13.926470588235293, "frac_reward_zero_std": 0.5, "grad_norm": 0.8743817210197449, "kl": 0.010985235800035298, "learning_rate": 2.555774172409618e-07, "loss": 0.00011016235657734796, "reward": 0.8354166746139526, "reward_std": 0.22894470393657684, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.5013870000839233, "step": 9470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 466.75, "completions/min_length": 376.0, "epoch": 13.927941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 0.7649944424629211, "kl": 0.00853771751280874, "learning_rate": 2.554654713323421e-07, "loss": 8.447188884019852e-05, "reward": 0.7348541617393494, "reward_std": 0.15908417105674744, "rewards/DrugCombAccuracyCOTORM/mean": 0.6894010305404663, "rewards/DrugCombAccuracyCOTORM/std": 0.40721777081489563, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.3442651927471161, "step": 9471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 471.75, "completions/min_length": 431.0, "epoch": 13.929411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9745084643363953, "kl": 0.011053995694965124, "learning_rate": 2.553535415330382e-07, "loss": 0.00011015774362022057, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.625, "rewards/DrugCombCoverageCOTORM/std": 0.8062257766723633, "step": 9472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 454.375, "completions/min_length": 374.0, "epoch": 13.930882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9156201481819153, "kl": 0.009979538968764246, "learning_rate": 2.55241627850424e-07, "loss": 9.893998503684998e-05, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 458.125, "completions/min_length": 409.0, "epoch": 13.93235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013919485732913017, "kl": 0.008134670672006905, "learning_rate": 2.5512973029187195e-07, "loss": 8.152496593538672e-05, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5, "rewards/DrugCombCoverageCOTORM/std": 0.5163977742195129, "step": 9474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 484.1875, "completions/min_length": 395.0, "epoch": 13.933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.010652714408934116, "kl": 0.008191333152353764, "learning_rate": 2.5501784886475356e-07, "loss": 8.25419119792059e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 440.4375, "completions/min_length": 392.0, "epoch": 13.935294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0382064580917358, "kl": 0.01195059483870864, "learning_rate": 2.549059835764393e-07, "loss": 0.00011904165148735046, "reward": 0.8500000238418579, "reward_std": 0.20701968669891357, "rewards/DrugCombAccuracyCOTORM/mean": 0.8125, "rewards/DrugCombAccuracyCOTORM/std": 0.40311288833618164, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 446.875, "completions/min_length": 404.0, "epoch": 13.936764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.03151604160666466, "kl": 0.009678757400251925, "learning_rate": 2.5479413443429866e-07, "loss": 9.656189649831504e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 442.4375, "completions/min_length": 356.0, "epoch": 13.938235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7828906774520874, "kl": 0.008144225226715207, "learning_rate": 2.546823014456998e-07, "loss": 8.106231689453125e-05, "reward": 0.5943333506584167, "reward_std": 0.03111269511282444, "rewards/DrugCombAccuracyCOTORM/mean": 0.5137500166893005, "rewards/DrugCombAccuracyCOTORM/std": 0.5050000548362732, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8333333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.17213258147239685, "step": 9478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 433.5625, "completions/min_length": 358.0, "epoch": 13.939705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8768564462661743, "kl": 0.008904768386855721, "learning_rate": 2.545704846180101e-07, "loss": 8.910894393920898e-05, "reward": 0.460812509059906, "reward_std": 0.11083899438381195, "rewards/DrugCombAccuracyCOTORM/mean": 0.4529687464237213, "rewards/DrugCombAccuracyCOTORM/std": 0.5018875002861023, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": -0.015625, "rewards/DrugCombCoverageCOTORM/std": 1.0184496641159058, "step": 9479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 484.3125, "completions/min_length": 412.0, "epoch": 13.941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3270479440689087, "kl": 0.009467821568250656, "learning_rate": 2.5445868395859603e-07, "loss": 9.456276893615723e-05, "reward": 0.44999998807907104, "reward_std": 0.38508620858192444, "rewards/DrugCombAccuracyCOTORM/mean": 0.4375, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 0.9660918116569519, "step": 9480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/mean_length": 597.25, "completions/min_length": 454.0, "epoch": 13.94264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9761524796485901, "kl": 0.010236491914838552, "learning_rate": 2.5434689947482224e-07, "loss": 0.00010323501192033291, "reward": 0.5777778029441833, "reward_std": 0.011596539989113808, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7777777910232544, "rewards/DrugCombCoverageCOTORM/std": 0.27888667583465576, "step": 9481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 405.25, "completions/min_length": 338.0, "epoch": 13.944117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 0.9217087030410767, "kl": 0.009641347685828805, "learning_rate": 2.5423513117405304e-07, "loss": 9.412318468093872e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 489.1875, "completions/min_length": 377.0, "epoch": 13.945588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01032722182571888, "kl": 0.0067691372241824865, "learning_rate": 2.5412337906365143e-07, "loss": 6.759694952052087e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 484.75, "completions/min_length": 391.0, "epoch": 13.947058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 4.933511734008789, "kl": 0.11891153955366462, "learning_rate": 2.5401164315097945e-07, "loss": 0.0011905208230018616, "reward": 0.8083666563034058, "reward_std": 0.20491597056388855, "rewards/DrugCombAccuracyCOTORM/mean": 0.7802500128746033, "rewards/DrugCombAccuracyCOTORM/std": 0.3931417763233185, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8416666984558105, "rewards/DrugCombCoverageCOTORM/std": 0.2837578356266022, "step": 9484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 426.625, "completions/min_length": 389.0, "epoch": 13.948529411764707, "frac_reward_zero_std": 1.0, "grad_norm": 0.017247667536139488, "kl": 0.009075228008441627, "learning_rate": 2.538999234433978e-07, "loss": 9.08706642803736e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/mean_length": 518.0, "completions/min_length": 410.0, "epoch": 13.95, "frac_reward_zero_std": 0.5, "grad_norm": 0.9308747053146362, "kl": 0.011635417700745165, "learning_rate": 2.537882199482665e-07, "loss": 0.0001184977445518598, "reward": 0.7782738208770752, "reward_std": 0.1366949826478958, "rewards/DrugCombAccuracyCOTORM/mean": 0.7410714626312256, "rewards/DrugCombAccuracyCOTORM/std": 0.34458065032958984, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8541666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.24247948825359344, "step": 9486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/mean_length": 441.1875, "completions/min_length": 364.0, "epoch": 13.951470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012577979825437069, "kl": 0.010918765095993876, "learning_rate": 2.5367653267294406e-07, "loss": 0.00010827495134435594, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 512.25, "completions/min_length": 423.0, "epoch": 13.952941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.027556266635656357, "kl": 0.009197501698508859, "learning_rate": 2.535648616247884e-07, "loss": 9.033295646077022e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 443.9375, "completions/min_length": 371.0, "epoch": 13.954411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 2.788461923599243, "kl": 0.009613252710551023, "learning_rate": 2.5345320681115593e-07, "loss": 9.615719318389893e-05, "reward": 0.574999988079071, "reward_std": 0.41661906242370605, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.75, "rewards/DrugCombCoverageCOTORM/std": 0.6831300854682922, "step": 9489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 456.8125, "completions/min_length": 428.0, "epoch": 13.955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9135085940361023, "kl": 0.008068165159784257, "learning_rate": 2.5334156823940236e-07, "loss": 8.046310540521517e-05, "reward": 0.606249988079071, "reward_std": 0.1590990275144577, "rewards/DrugCombAccuracyCOTORM/mean": 0.5625, "rewards/DrugCombAccuracyCOTORM/std": 0.5123475790023804, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.5625, "rewards/DrugCombCoverageCOTORM/std": 0.5123475790023804, "step": 9490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 469.75, "completions/min_length": 411.0, "epoch": 13.95735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01898784749209881, "kl": 0.010332665406167507, "learning_rate": 2.5322994591688194e-07, "loss": 0.00010375557030783966, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 473.4375, "completions/min_length": 400.0, "epoch": 13.958823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.4460312128067017, "kl": 0.012481984915211797, "learning_rate": 2.5311833985094845e-07, "loss": 0.0001285746693611145, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 510.0, "completions/min_length": 411.0, "epoch": 13.96029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9512367844581604, "kl": 0.010414475342258811, "learning_rate": 2.5300675004895366e-07, "loss": 0.0001050389400916174, "reward": 0.7000000476837158, "reward_std": 0.18516401946544647, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.3415650427341461, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/mean_length": 464.3125, "completions/min_length": 385.0, "epoch": 13.961764705882352, "frac_reward_zero_std": 0.5, "grad_norm": 1.1387486457824707, "kl": 0.012666403083130717, "learning_rate": 2.528951765182492e-07, "loss": 0.00012547890946734697, "reward": 0.8420833349227905, "reward_std": 0.19493992626667023, "rewards/DrugCombAccuracyCOTORM/mean": 0.8104166984558105, "rewards/DrugCombAccuracyCOTORM/std": 0.3739437162876129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9375, "rewards/DrugCombCoverageCOTORM/std": 0.13437095284461975, "step": 9494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 462.5625, "completions/min_length": 437.0, "epoch": 13.963235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0330212116241455, "kl": 0.011934574926272035, "learning_rate": 2.527836192661852e-07, "loss": 0.00011961907148361206, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 449.9375, "completions/min_length": 359.0, "epoch": 13.964705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9437747597694397, "kl": 0.012351603829301894, "learning_rate": 2.526720783001107e-07, "loss": 0.00012455078831408173, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 462.875, "completions/min_length": 371.0, "epoch": 13.966176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.8809987306594849, "kl": 0.010429357294924557, "learning_rate": 2.525605536273737e-07, "loss": 0.00010386109352111816, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 468.9375, "completions/min_length": 389.0, "epoch": 13.967647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.05525125935673714, "kl": 0.011247562244534492, "learning_rate": 2.524490452553213e-07, "loss": 0.00011314359289826825, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 474.75, "completions/min_length": 426.0, "epoch": 13.969117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.0833648443222046, "kl": 0.011136050568893552, "learning_rate": 2.5233755319129933e-07, "loss": 0.0001101832531276159, "reward": 0.800000011920929, "reward_std": 0.21380899846553802, "rewards/DrugCombAccuracyCOTORM/mean": 0.75, "rewards/DrugCombAccuracyCOTORM/std": 0.44721361994743347, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 442.6875, "completions/min_length": 392.0, "epoch": 13.970588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9671428799629211, "kl": 0.012561243493109941, "learning_rate": 2.5222607744265263e-07, "loss": 0.00012484924809541553, "reward": 0.9178333282470703, "reward_std": 0.15214310586452484, "rewards/DrugCombAccuracyCOTORM/mean": 0.9025000333786011, "rewards/DrugCombAccuracyCOTORM/std": 0.26642072200775146, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9583333730697632, "rewards/DrugCombCoverageCOTORM/std": 0.11385500431060791, "step": 9500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 433.875, "completions/min_length": 386.0, "epoch": 13.972058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.008440881036221981, "kl": 0.0072209814097732306, "learning_rate": 2.5211461801672485e-07, "loss": 7.199050742201507e-05, "reward": 0.5, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.0, "rewards/DrugCombCoverageCOTORM/std": 1.0327955484390259, "step": 9501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 430.1875, "completions/min_length": 377.0, "epoch": 13.973529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.007369222119450569, "kl": 0.0068317289697006345, "learning_rate": 2.5200317492085876e-07, "loss": 6.825727905379608e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 417.375, "completions/min_length": 366.0, "epoch": 13.975, "frac_reward_zero_std": 1.0, "grad_norm": 0.011408494785428047, "kl": 0.008394967764616013, "learning_rate": 2.5189174816239607e-07, "loss": 8.366432302864268e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 424.1875, "completions/min_length": 373.0, "epoch": 13.976470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8540357947349548, "kl": 0.008696207078173757, "learning_rate": 2.517803377486769e-07, "loss": 8.61843436723575e-05, "reward": 0.949999988079071, "reward_std": 0.1414213478565216, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 491.375, "completions/min_length": 428.0, "epoch": 13.977941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 1.2067333459854126, "kl": 0.009088133461773396, "learning_rate": 2.516689436870409e-07, "loss": 9.138882160186768e-05, "reward": 0.6516667008399963, "reward_std": 0.1922103315591812, "rewards/DrugCombAccuracyCOTORM/mean": 0.574999988079071, "rewards/DrugCombAccuracyCOTORM/std": 0.3213789761066437, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9166666269302368, "rewards/DrugCombCoverageCOTORM/std": 0.08606630563735962, "step": 9505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 561.4375, "completions/min_length": 425.0, "epoch": 13.979411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8601759076118469, "kl": 0.00819496565964073, "learning_rate": 2.515575659848266e-07, "loss": 8.248207450378686e-05, "reward": 0.7771454453468323, "reward_std": 0.12792059779167175, "rewards/DrugCombAccuracyCOTORM/mean": 0.7358415126800537, "rewards/DrugCombAccuracyCOTORM/std": 0.3430335819721222, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.8847222328186035, "rewards/DrugCombCoverageCOTORM/std": 0.2089509516954422, "step": 9506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 431.125, "completions/min_length": 381.0, "epoch": 13.980882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 1.5698742866516113, "kl": 0.0178393900860101, "learning_rate": 2.51446204649371e-07, "loss": 0.0001791343092918396, "reward": 0.6000000238418579, "reward_std": 0.37032803893089294, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/mean_length": 453.5625, "completions/min_length": 418.0, "epoch": 13.98235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01581418886780739, "kl": 0.007415974396280944, "learning_rate": 2.513348596880105e-07, "loss": 7.446029485436156e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 424.8125, "completions/min_length": 375.0, "epoch": 13.983823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.012037287466228008, "kl": 0.008076277794316411, "learning_rate": 2.5122353110808e-07, "loss": 8.057578816078603e-05, "reward": 0.6000000238418579, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 0.5, "rewards/DrugCombAccuracyCOTORM/std": 0.5163977742195129, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 454.4375, "completions/min_length": 415.0, "epoch": 13.985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.03465652093291283, "kl": 0.010753289796411991, "learning_rate": 2.5111221891691383e-07, "loss": 0.00010870541154872626, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 499.25, "completions/min_length": 441.0, "epoch": 13.986764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 1.3837003707885742, "kl": 0.013077352195978165, "learning_rate": 2.510009231218448e-07, "loss": 0.0001291334629058838, "reward": 0.4426250457763672, "reward_std": 0.3423117399215698, "rewards/DrugCombAccuracyCOTORM/mean": 0.3332291841506958, "rewards/DrugCombAccuracyCOTORM/std": 0.4037328064441681, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.7604166865348816, "rewards/DrugCombCoverageCOTORM/std": 0.4035433530807495, "step": 9511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 456.375, "completions/min_length": 404.0, "epoch": 13.988235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9226711988449097, "kl": 0.008238307782448828, "learning_rate": 2.5088964373020484e-07, "loss": 8.251247345469892e-05, "reward": 0.9333333373069763, "reward_std": 0.14253932237625122, "rewards/DrugCombAccuracyCOTORM/mean": 0.9166666865348816, "rewards/DrugCombAccuracyCOTORM/std": 0.25819888710975647, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 436.9375, "completions/min_length": 397.0, "epoch": 13.989705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7299228310585022, "kl": 0.007706988952122629, "learning_rate": 2.5077838074932465e-07, "loss": 7.737283158348873e-05, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/DrugCombAccuracyCOTORM/mean": 0.9375, "rewards/DrugCombAccuracyCOTORM/std": 0.25, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 430.5, "completions/min_length": 382.0, "epoch": 13.991176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.011552260257303715, "kl": 0.0091863963752985, "learning_rate": 2.5066713418653406e-07, "loss": 9.164253424387425e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 463.5, "completions/min_length": 422.0, "epoch": 13.992647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.010913575068116188, "kl": 0.008296108338981867, "learning_rate": 2.5055590404916196e-07, "loss": 8.354103192687035e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 446.75, "completions/min_length": 380.0, "epoch": 13.994117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.7629755735397339, "kl": 0.008599077933467925, "learning_rate": 2.504446903445353e-07, "loss": 8.539643022231758e-05, "reward": 0.6875, "reward_std": 0.19594095647335052, "rewards/DrugCombAccuracyCOTORM/mean": 0.625, "rewards/DrugCombAccuracyCOTORM/std": 0.5, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.875, "rewards/DrugCombCoverageCOTORM/std": 0.5, "step": 9516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/mean_length": 443.9375, "completions/min_length": 344.0, "epoch": 13.995588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.012570273131132126, "kl": 0.007898610550910234, "learning_rate": 2.50333493079981e-07, "loss": 7.864889630582184e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 491.625, "completions/min_length": 413.0, "epoch": 13.99705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9531858563423157, "kl": 0.010233582346700132, "learning_rate": 2.502223122628243e-07, "loss": 0.00010223709250567481, "reward": 0.9569754600524902, "reward_std": 0.12169169634580612, "rewards/DrugCombAccuracyCOTORM/mean": 0.9488235116004944, "rewards/DrugCombAccuracyCOTORM/std": 0.2047058790922165, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 0.9791666865348816, "rewards/DrugCombCoverageCOTORM/std": 0.0833333283662796, "step": 9518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 455.875, "completions/min_length": 339.0, "epoch": 13.998529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.023187464103102684, "kl": 0.010549088008701801, "learning_rate": 2.501111479003896e-07, "loss": 0.00010473499423824251, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 440.0, "completions/min_length": 408.0, "epoch": 14.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.014022547751665115, "kl": 0.00840020866598934, "learning_rate": 2.500000000000001e-07, "loss": 8.412348688580096e-05, "reward": 1.0, "reward_std": 0.0, "rewards/DrugCombAccuracyCOTORM/mean": 1.0, "rewards/DrugCombAccuracyCOTORM/std": 0.0, "rewards/DrugCombCOTFormatORM/mean": 1.0, "rewards/DrugCombCOTFormatORM/std": 0.0, "rewards/DrugCombCOTThinkORM/mean": 1.0, "rewards/DrugCombCOTThinkORM/std": 0.0, "rewards/DrugCombCoverageCOTORM/mean": 1.0, "rewards/DrugCombCoverageCOTORM/std": 0.0, "step": 9520 } ], "logging_steps": 1, "max_steps": 13600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }